From 8d412f9eeaa005c56ef80b85ba4fd212632907c4 Mon Sep 17 00:00:00 2001
From: DrKLO
Date: Sun, 14 Sep 2014 03:40:36 +0400
Subject: [PATCH] Updated to 1.8.3, video compression with parallel upload to server (Android 4.3+ only now, disabled)

---
 TMessagesProj/build.gradle | 4 +-
 TMessagesProj/jni/Android.mk | 3 +-
 TMessagesProj/jni/fake.c | 2 +-
 TMessagesProj/jni/image.c | 16 +-
 TMessagesProj/jni/libyuv/include/libyuv.h | 33 +
 .../jni/libyuv/include/libyuv/basic_types.h | 118 +
 .../jni/libyuv/include/libyuv/compare.h | 73 +
 .../jni/libyuv/include/libyuv/convert.h | 254 +
 .../jni/libyuv/include/libyuv/convert_argb.h | 225 +
 .../jni/libyuv/include/libyuv/convert_from.h | 173 +
 .../libyuv/include/libyuv/convert_from_argb.h | 166 +
 .../jni/libyuv/include/libyuv/cpu_id.h | 81 +
 .../libyuv/include/libyuv/format_conversion.h | 168 +
 .../jni/libyuv/include/libyuv/mjpeg_decoder.h | 192 +
 .../libyuv/include/libyuv/planar_functions.h | 439 +
 .../jni/libyuv/include/libyuv/rotate.h | 117 +
 .../jni/libyuv/include/libyuv/rotate_argb.h | 33 +
 TMessagesProj/jni/libyuv/include/libyuv/row.h | 1821 ++++
 .../jni/libyuv/include/libyuv/scale.h | 102 +
 .../jni/libyuv/include/libyuv/scale_argb.h | 57 +
 .../jni/libyuv/include/libyuv/scale_row.h | 349 +
 .../jni/libyuv/include/libyuv/version.h | 16 +
 .../jni/libyuv/include/libyuv/video_common.h | 182 +
 TMessagesProj/jni/libyuv/source/compare.cc | 325 +
 .../jni/libyuv/source/compare_common.cc | 42 +
 .../jni/libyuv/source/compare_neon.cc | 103 +
 .../jni/libyuv/source/compare_posix.cc | 158 +
 .../jni/libyuv/source/compare_win.cc | 232 +
 TMessagesProj/jni/libyuv/source/convert.cc | 1543 ++++
 .../jni/libyuv/source/convert_argb.cc | 938 +++
 .../jni/libyuv/source/convert_from.cc | 1210 +++
 .../jni/libyuv/source/convert_from_argb.cc | 1133 +++
 .../jni/libyuv/source/convert_jpeg.cc | 392 +
 .../jni/libyuv/source/convert_to_argb.cc | 327 +
 .../jni/libyuv/source/convert_to_i420.cc | 383 +
 TMessagesProj/jni/libyuv/source/cpu_id.cc | 293 +
 .../jni/libyuv/source/format_conversion.cc | 554 ++
 .../jni/libyuv/source/mjpeg_decoder.cc | 566 ++
 .../jni/libyuv/source/mjpeg_validate.cc | 47 +
 .../jni/libyuv/source/planar_functions.cc | 2291 +++++
 TMessagesProj/jni/libyuv/source/rotate.cc | 1315 +++
 .../jni/libyuv/source/rotate_argb.cc | 209 +
 .../jni/libyuv/source/rotate_mips.cc | 485 ++
 .../jni/libyuv/source/rotate_neon.cc | 533 ++
 .../jni/libyuv/source/rotate_neon64.cc | 540 ++
 TMessagesProj/jni/libyuv/source/row_any.cc | 602 ++
 TMessagesProj/jni/libyuv/source/row_common.cc | 2286 +++++
 TMessagesProj/jni/libyuv/source/row_mips.cc | 994 +++
 TMessagesProj/jni/libyuv/source/row_neon.cc | 3148 +++++++
 TMessagesProj/jni/libyuv/source/row_neon64.cc | 3327 ++++++++
 TMessagesProj/jni/libyuv/source/row_posix.cc | 6443 ++++++++++++++
 TMessagesProj/jni/libyuv/source/row_win.cc | 7402 +++++++++++++++++
 TMessagesProj/jni/libyuv/source/row_x86.asm | 146 +
 TMessagesProj/jni/libyuv/source/scale.cc | 1716 ++++
 TMessagesProj/jni/libyuv/source/scale_argb.cc | 809 ++
 .../jni/libyuv/source/scale_common.cc | 1165 +++
 TMessagesProj/jni/libyuv/source/scale_mips.cc | 654 ++
 TMessagesProj/jni/libyuv/source/scale_neon.cc | 764 ++
 .../jni/libyuv/source/scale_neon64.cc | 789 ++
 .../jni/libyuv/source/scale_posix.cc | 1315 +++
 TMessagesProj/jni/libyuv/source/scale_win.cc | 1320 +++
 .../jni/libyuv/source/video_common.cc | 64 +
 TMessagesProj/jni/libyuv/source/x86inc.asm | 1136 +++
 .../libs/armeabi-v7a/libtmessages.so | Bin 856740 -> 864932 bytes
 TMessagesProj/libs/armeabi/libtmessages.so | Bin 803472 -> 811664 bytes
 TMessagesProj/libs/x86/libtmessages.so | Bin 1250356 -> 1262644 bytes
 TMessagesProj/src/main/AndroidManifest.xml | 2 +-
 .../org/telegram/android/ImageLoader.java | 121 +-
 .../telegram/android/LocaleController.java | 6 +-
 .../org/telegram/android/MediaController.java | 17 +-
 .../org/telegram/android/MessageObject.java | 19 +-
 .../telegram/android/MessagesController.java | 7 +-
 .../org/telegram/android/MessagesStorage.java | 54 +-
 .../org/telegram/android/NativeLoader.java | 6 +-
 .../telegram/android/NotificationCenter.java | 2 +-
 .../android/NotificationsController.java | 11 +-
 .../org/telegram/android/PhotoObject.java | 19 +-
 .../telegram/android/SendMessagesHelper.java | 588 +-
 .../telegram/android/video/InputSurface.java | 135 +
 .../telegram/android/video/MP4Builder.java | 430 +
 .../org/telegram/android/video/Mp4Movie.java | 81 +
 .../telegram/android/video/OutputSurface.java | 180 +
 .../org/telegram/android/video/Sample.java | 27 +
 .../android/video/TextureRenderer.java | 200 +
 .../org/telegram/android/video/Track.java | 247 +
 .../messenger/ConnectionsManager.java | 7 +-
 .../org/telegram/messenger/FileLoader.java | 23 +-
 .../messenger/FileUploadOperation.java | 74 +-
 .../java/org/telegram/messenger/TLRPC.java | 6 +-
 .../org/telegram/messenger/Utilities.java | 2 +-
 .../org/telegram/ui/Cells/ChatBaseCell.java | 2 +-
 .../org/telegram/ui/Cells/ChatMediaCell.java | 1 -
 .../org/telegram/ui/Cells/DialogCell.java | 23 +-
 .../java/org/telegram/ui/ChatActivity.java | 78 +-
 .../org/telegram/ui/ContactsActivity.java | 2 +-
 .../java/org/telegram/ui/LaunchActivity.java | 9 +-
 .../telegram/ui/LoginActivityPhoneView.java | 10 +-
 .../org/telegram/ui/LoginActivitySmsView.java | 1 -
 .../ui/PopupNotificationActivity.java | 4 +-
 .../org/telegram/ui/VideoEditorActivity.java | 489 +-
 .../ui/Views/ActionBar/ActionBarActivity.java | 3 +
 .../ui/Views/ChatActivityEnterView.java | 11 +-
 .../ui/Views/SizeNotifierRelativeLayout.java | 7 +-
 .../src/main/res/values-de/strings.xml | 2 +-
 .../src/main/res/values-nl/strings.xml | 2 +-
 105 files changed, 54738 insertions(+), 493 deletions(-)
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/basic_types.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/compare.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/convert.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/convert_argb.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/convert_from.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/convert_from_argb.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/cpu_id.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/format_conversion.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/mjpeg_decoder.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/planar_functions.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/rotate.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/rotate_argb.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/row.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/scale.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/scale_argb.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/scale_row.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/version.h
 create mode 100644 TMessagesProj/jni/libyuv/include/libyuv/video_common.h
 create mode 100644 TMessagesProj/jni/libyuv/source/compare.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/compare_common.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/compare_neon.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/compare_posix.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/compare_win.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/convert.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/convert_argb.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/convert_from.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/convert_from_argb.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/convert_jpeg.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/convert_to_argb.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/convert_to_i420.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/cpu_id.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/format_conversion.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/mjpeg_decoder.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/mjpeg_validate.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/planar_functions.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/rotate.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/rotate_argb.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/rotate_mips.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/rotate_neon.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/rotate_neon64.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/row_any.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/row_common.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/row_mips.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/row_neon.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/row_neon64.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/row_posix.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/row_win.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/row_x86.asm
 create mode 100644 TMessagesProj/jni/libyuv/source/scale.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/scale_argb.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/scale_common.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/scale_mips.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/scale_neon.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/scale_neon64.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/scale_posix.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/scale_win.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/video_common.cc
 create mode 100644 TMessagesProj/jni/libyuv/source/x86inc.asm
 create mode 100644 TMessagesProj/src/main/java/org/telegram/android/video/InputSurface.java
 create mode 100644 TMessagesProj/src/main/java/org/telegram/android/video/MP4Builder.java
 create mode 100644 TMessagesProj/src/main/java/org/telegram/android/video/Mp4Movie.java
 create mode 100644 TMessagesProj/src/main/java/org/telegram/android/video/OutputSurface.java
 create mode 100644 TMessagesProj/src/main/java/org/telegram/android/video/Sample.java
 create mode 100644 TMessagesProj/src/main/java/org/telegram/android/video/TextureRenderer.java
 create mode 100644 TMessagesProj/src/main/java/org/telegram/android/video/Track.java

diff --git a/TMessagesProj/build.gradle b/TMessagesProj/build.gradle
index 7e1713307..bbfc4d219 100644
--- a/TMessagesProj/build.gradle
+++ b/TMessagesProj/build.gradle
@@ -80,7 +80,7 @@ android {
     defaultConfig {
         minSdkVersion 8
         targetSdkVersion 19
-        versionCode 320
-        versionName "1.8.0"
+        versionCode 326
+        versionName "1.8.3"
     }
 }
diff --git a/TMessagesProj/jni/Android.mk
b/TMessagesProj/jni/Android.mk index 1a1481668..87fffeb51 100755 --- a/TMessagesProj/jni/Android.mk +++ b/TMessagesProj/jni/Android.mk @@ -175,7 +175,8 @@ LOCAL_C_INCLUDES := \ ./opus/silk/fixed \ ./opus/celt \ ./opus/ \ -./opus/opusfile +./opus/opusfile \ +./libyuv/include LOCAL_SRC_FILES += \ ./libjpeg/jcapimin.c \ diff --git a/TMessagesProj/jni/fake.c b/TMessagesProj/jni/fake.c index b484b63f0..bd08db5a1 100644 --- a/TMessagesProj/jni/fake.c +++ b/TMessagesProj/jni/fake.c @@ -2,5 +2,5 @@ void fakeFunction() { printf("some androids has buggy native loader, so i should check size of libs in java to know that native library is correct. So each changed native library should has diffrent size in different app versions. This function will increase lib size for few bytes :)"); - printf("bla blablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablabla
blablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablabla"); + printf("bla blablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablabla
blablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablabla blablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablabl
ablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablabla blablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablab
lablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablabla blablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablabla
blablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablablabla"); } diff --git a/TMessagesProj/jni/image.c b/TMessagesProj/jni/image.c index 57a7f0dea..742bad89d 100644 --- a/TMessagesProj/jni/image.c +++ b/TMessagesProj/jni/image.c @@ -18,11 +18,11 @@ static void fastBlur(int imageWidth, int imageHeight, int imageStride, void *pix const int r1 = radius + 1; const int div = radius * 2 + 1; - if (radius > 15 || div >= w || div >= h) { + if (radius > 15 || div >= w || div >= h || w * h > 90 * 90 || imageStride > imageWidth * 4) { return; } - uint64_t rgb[imageStride * imageHeight]; + uint64_t *rgb = malloc(imageWidth * imageHeight * sizeof(uint64_t)); int x, y, i; @@ -95,6 +95,8 @@ static void fastBlur(int imageWidth, int imageHeight, int imageStride, void *pix } #undef update } + + free(rgb); } typedef struct my_error_mgr { @@ -109,14 +111,18 @@ METHODDEF(void) my_error_exit(j_common_ptr cinfo) { longjmp(myerr->setjmp_buffer, 1); } -JNIEXPORT void Java_org_telegram_messenger_Utilities_blurBitmap(JNIEnv *env, jclass class, jobject bitmap, int width, int height, int stride) { +JNIEXPORT void Java_org_telegram_messenger_Utilities_blurBitmap(JNIEnv *env, jclass class, jobject bitmap) { + if (!bitmap) { + return; + } + AndroidBitmapInfo info; if (AndroidBitmap_getInfo(env, bitmap, &info) < 0) { return; } - if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888) { + if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888 || !info.width || !info.height || !info.stride) { return; } @@ -124,7 +130,7 @@ JNIEXPORT void Java_org_telegram_messenger_Utilities_blurBitmap(JNIEnv *env, jcl if (AndroidBitmap_lockPixels(env, bitmap, &pixels) < 0) { return; } - fastBlur(width, height, stride, pixels); + fastBlur(info.width, info.height, info.stride, pixels); AndroidBitmap_unlockPixels(env, bitmap); } diff --git a/TMessagesProj/jni/libyuv/include/libyuv.h b/TMessagesProj/jni/libyuv/include/libyuv.h new file mode 100644 index 000000000..3bebe642c --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv.h @@ -0,0 +1,33 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_H_ // NOLINT +#define INCLUDE_LIBYUV_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/compare.h" +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" +#include "libyuv/convert_from.h" +#include "libyuv/convert_from_argb.h" +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#include "libyuv/mjpeg_decoder.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/scale.h" +#include "libyuv/scale_argb.h" +#include "libyuv/scale_row.h" +#include "libyuv/version.h" +#include "libyuv/video_common.h" + +#endif // INCLUDE_LIBYUV_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/basic_types.h b/TMessagesProj/jni/libyuv/include/libyuv/basic_types.h new file mode 100644 index 000000000..beb750ba6 --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/basic_types.h @@ -0,0 +1,118 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT +#define INCLUDE_LIBYUV_BASIC_TYPES_H_ + +#include // for NULL, size_t + +#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600)) +#include // for uintptr_t on x86 +#else +#include // for uintptr_t +#endif + +#ifndef GG_LONGLONG +#ifndef INT_TYPES_DEFINED +#define INT_TYPES_DEFINED +#ifdef COMPILER_MSVC +typedef unsigned __int64 uint64; +typedef __int64 int64; +#ifndef INT64_C +#define INT64_C(x) x ## I64 +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UI64 +#endif +#define INT64_F "I64" +#else // COMPILER_MSVC +#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) +typedef unsigned long uint64; // NOLINT +typedef long int64; // NOLINT +#ifndef INT64_C +#define INT64_C(x) x ## L +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UL +#endif +#define INT64_F "l" +#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) +typedef unsigned long long uint64; // NOLINT +typedef long long int64; // NOLINT +#ifndef INT64_C +#define INT64_C(x) x ## LL +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## ULL +#endif +#define INT64_F "ll" +#endif // __LP64__ +#endif // COMPILER_MSVC +typedef unsigned int uint32; +typedef int int32; +typedef unsigned short uint16; // NOLINT +typedef short int16; // NOLINT +typedef unsigned char uint8; +typedef signed char int8; +#endif // INT_TYPES_DEFINED +#endif // GG_LONGLONG + +// Detect compiler is for x86 or x64. +#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) +#define CPU_X86 1 +#endif +// Detect compiler is for ARM. 
+#if defined(__arm__) || defined(_M_ARM) +#define CPU_ARM 1 +#endif + +#ifndef ALIGNP +#ifdef __cplusplus +#define ALIGNP(p, t) \ + (reinterpret_cast(((reinterpret_cast(p) + \ + ((t) - 1)) & ~((t) - 1)))) +#else +#define ALIGNP(p, t) \ + ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */ +#endif +#endif + +#if !defined(LIBYUV_API) +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(LIBYUV_BUILDING_SHARED_LIBRARY) +#define LIBYUV_API __declspec(dllexport) +#elif defined(LIBYUV_USING_SHARED_LIBRARY) +#define LIBYUV_API __declspec(dllimport) +#else +#define LIBYUV_API +#endif // LIBYUV_BUILDING_SHARED_LIBRARY +#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \ + (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ + defined(LIBYUV_USING_SHARED_LIBRARY)) +#define LIBYUV_API __attribute__ ((visibility ("default"))) +#else +#define LIBYUV_API +#endif // __GNUC__ +#endif // LIBYUV_API + +#define LIBYUV_BOOL int +#define LIBYUV_FALSE 0 +#define LIBYUV_TRUE 1 + +// Visual C x86 or GCC little endian. +#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) || \ + defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define LIBYUV_LITTLE_ENDIAN +#endif + +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/compare.h b/TMessagesProj/jni/libyuv/include/libyuv/compare.h new file mode 100644 index 000000000..5dfac7c86 --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/compare.h @@ -0,0 +1,73 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT +#define INCLUDE_LIBYUV_COMPARE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Compute a hash for specified memory. Seed of 5381 recommended. +LIBYUV_API +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed); + +// Sum Square Error - used to compute Mean Square Error or PSNR. 
+LIBYUV_API +uint64 ComputeSumSquareError(const uint8* src_a, + const uint8* src_b, int count); + +LIBYUV_API +uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +static const int kMaxPsnr = 128; + +LIBYUV_API +double SumSquareErrorToPsnr(uint64 sse, uint64 count); + +LIBYUV_API +double CalcFramePsnr(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +LIBYUV_API +double I420Psnr(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height); + +LIBYUV_API +double CalcFrameSsim(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +LIBYUV_API +double I420Ssim(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/convert.h b/TMessagesProj/jni/libyuv/include/libyuv/convert.h new file mode 100644 index 000000000..1bd45c837 --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/convert.h @@ -0,0 +1,254 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_H_ + +#include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following headers includes. +#include "libyuv/convert_from.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert I444 to I420. +LIBYUV_API +int I444ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I422 to I420. +LIBYUV_API +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I411 to I420. +LIBYUV_API +int I411ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy I420 to I420. 
+#define I420ToI420 I420Copy +LIBYUV_API +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I400 (grey) to I420. +LIBYUV_API +int I400ToI420(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert NV12 to I420. +LIBYUV_API +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert NV21 to I420. +LIBYUV_API +int NV21ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert YUY2 to I420. +LIBYUV_API +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert Q420 to I420. +LIBYUV_API +int Q420ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// ARGB little endian (bgra in memory) to I420. +LIBYUV_API +int ARGBToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// BGRA little endian (argb in memory) to I420. +LIBYUV_API +int BGRAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// ABGR little endian (rgba in memory) to I420. +LIBYUV_API +int ABGRToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGBA little endian (abgr in memory) to I420. +LIBYUV_API +int RGBAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB little endian (bgr in memory) to I420. +LIBYUV_API +int RGB24ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB big endian (rgb in memory) to I420. 
+LIBYUV_API +int RAWToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB16 (RGBP fourcc) little endian to I420. +LIBYUV_API +int RGB565ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB15 (RGBO fourcc) little endian to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB12 (R444 fourcc) little endian to I420. +LIBYUV_API +int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture. +// dst_width/height for clipping determine final size. +LIBYUV_API +int MJPGToI420(const uint8* sample, size_t sample_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, + int dst_width, int dst_height); + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8* sample, size_t sample_size, + int* width, int* height); +#endif + +// Note Bayer formats (BGGR) To I420 are in format_conversion.h + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_y" number of bytes in a row of the dst_y plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. +// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "crop_width" / "crop_height" is the size to crop the src to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "format" is a fourcc. ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. 
+LIBYUV_API +int ConvertToI420(const uint8* src_frame, size_t src_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/convert_argb.h b/TMessagesProj/jni/libyuv/include/libyuv/convert_argb.h new file mode 100644 index 000000000..a18014ca2 --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/convert_argb.h @@ -0,0 +1,225 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_ARGB_H_ + +#include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following headers includes +#include "libyuv/convert_from.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" + +// TODO(fbarchard): This set of functions should exactly match convert.h +// Add missing Q420. +// TODO(fbarchard): Add tests. Create random content of right size and convert +// with C vs Opt and or to I420 and compare. +// TODO(fbarchard): Some of these functions lack parameter setting. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Alias. +#define ARGBToARGB ARGBCopy + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I420 to ARGB. +LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I411 to ARGB. +LIBYUV_API +int I411ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I400 (grey) to ARGB. +LIBYUV_API +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Alias. +#define YToARGB I400ToARGB_Reference + +// Convert I400 to ARGB. Reverse of ARGBToI400. +LIBYUV_API +int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV12 to ARGB. 
+LIBYUV_API +int NV12ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert M420 to ARGB. +LIBYUV_API +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// TODO(fbarchard): Convert Q420 to ARGB. +// LIBYUV_API +// int Q420ToARGB(const uint8* src_y, int src_stride_y, +// const uint8* src_yuy2, int src_stride_yuy2, +// uint8* dst_argb, int dst_stride_argb, +// int width, int height); + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// BGRA little endian (argb in memory) to ARGB. +LIBYUV_API +int BGRAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// ABGR little endian (rgba in memory) to ARGB. +LIBYUV_API +int ABGRToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGBA little endian (abgr in memory) to ARGB. +LIBYUV_API +int RGBAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Deprecated function name. +#define BG24ToARGB RGB24ToARGB + +// RGB little endian (bgr in memory) to ARGB. +LIBYUV_API +int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB big endian (rgb in memory) to ARGB. +LIBYUV_API +int RAWToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB16 (RGBP fourcc) little endian to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB15 (RGBO fourcc) little endian to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB12 (R444 fourcc) little endian to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture +// dst_width/height for clipping determine final size. +LIBYUV_API +int MJPGToARGB(const uint8* sample, size_t sample_size, + uint8* dst_argb, int dst_stride_argb, + int src_width, int src_height, + int dst_width, int dst_height); +#endif + +// Note Bayer formats (BGGR) to ARGB are in format_conversion.h. + +// Convert camera sample to ARGB with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_argb" number of bytes in a row of the dst_argb plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. 
+// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. +// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "crop_width" / "crop_height" is the size to crop the src to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "format" is a fourcc. ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. +LIBYUV_API +int ConvertToARGB(const uint8* src_frame, size_t src_size, + uint8* dst_argb, int dst_stride_argb, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/convert_from.h b/TMessagesProj/jni/libyuv/include/libyuv/convert_from.h new file mode 100644 index 000000000..b1cf57f7d --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/convert_from.h @@ -0,0 +1,173 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_FROM_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// See Also convert.h for conversions from formats to I420. + +// I420Copy in convert to I420ToI420. + +LIBYUV_API +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int I420ToI444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int I420ToI411(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. 
+LIBYUV_API +int I400Copy(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// TODO(fbarchard): I420ToM420 +// TODO(fbarchard): I420ToQ420 + +LIBYUV_API +int I420ToNV12(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +LIBYUV_API +int I420ToNV21(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +LIBYUV_API +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +LIBYUV_API +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToRAW(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Note Bayer formats (BGGR) To I420 are in format_conversion.h. + +// Convert I420 to specified format. +// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the +// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. 
+LIBYUV_API +int ConvertFromI420(const uint8* y, int y_stride, + const uint8* u, int u_stride, + const uint8* v, int v_stride, + uint8* dst_sample, int dst_sample_stride, + int width, int height, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/convert_from_argb.h b/TMessagesProj/jni/libyuv/include/libyuv/convert_from_argb.h new file mode 100644 index 000000000..90f43af04 --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/convert_from_argb.h @@ -0,0 +1,166 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB to ARGB. +#define ARGBToARGB ARGBCopy +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To BGRA. +LIBYUV_API +int ARGBToBGRA(const uint8* src_argb, int src_stride_argb, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height); + +// Convert ARGB To ABGR. +LIBYUV_API +int ARGBToABGR(const uint8* src_argb, int src_stride_argb, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + +// Convert ARGB To RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height); + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb, int dst_stride_rgb, + int width, int height); + +// Convert ARGB To RGB565. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height); + +// Convert ARGB To ARGB4444. +LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height); + +// Convert ARGB To I444. +LIBYUV_API +int ARGBToI444(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I422. +LIBYUV_API +int ARGBToI422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I420. (also in convert.h) +LIBYUV_API +int ARGBToI420(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J420. (JPeg full range I420). 
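A short sketch of the planar destination that ARGBToI420 above expects: a full-resolution Y plane plus quarter-resolution U and V planes whose strides follow the (width + 1) / 2 convention quoted earlier. The malloc-based allocation is an assumption made for the example.

#include <stdlib.h>
#include "libyuv/convert_from_argb.h"

// Hypothetical example: convert a packed ARGB image into freshly allocated
// I420 planes with half-resolution chroma.
static int ARGBImageToI420(const uint8* argb, int argb_stride,
                           int width, int height) {
  const int y_stride = width;
  const int uv_stride = (width + 1) / 2;     // recommended chroma stride.
  const int uv_height = (height + 1) / 2;
  uint8* y = (uint8*)malloc((size_t)y_stride * height);
  uint8* u = (uint8*)malloc((size_t)uv_stride * uv_height);
  uint8* v = (uint8*)malloc((size_t)uv_stride * uv_height);
  int ret = -1;
  if (y && u && v) {
    ret = libyuv::ARGBToI420(argb, argb_stride,
                             y, y_stride,
                             u, uv_stride,
                             v, uv_stride,
                             width, height);
  }
  free(y);
  free(u);
  free(v);
  return ret;
}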
+LIBYUV_API +int ARGBToJ420(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I411. +LIBYUV_API +int ARGBToI411(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J400. (JPeg full range). +LIBYUV_API +int ARGBToJ400(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + int width, int height); + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Convert ARGB To NV12. +LIBYUV_API +int ARGBToNV12(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +// Convert ARGB To NV21. +LIBYUV_API +int ARGBToNV21(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +// Convert ARGB To NV21. +LIBYUV_API +int ARGBToNV21(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +// Convert ARGB To YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height); + +// Convert ARGB To UYVY. +LIBYUV_API +int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/cpu_id.h b/TMessagesProj/jni/libyuv/include/libyuv/cpu_id.h new file mode 100644 index 000000000..dc858a814 --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/cpu_id.h @@ -0,0 +1,81 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT +#define INCLUDE_LIBYUV_CPU_ID_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// TODO(fbarchard): Consider overlapping bits for different architectures. +// Internal flag to indicate cpuid requires initialization. +#define kCpuInit 0x1 + +// These flags are only valid on ARM processors. +static const int kCpuHasARM = 0x2; +static const int kCpuHasNEON = 0x4; +// 0x8 reserved for future ARM flag. + +// These flags are only valid on x86 processors. +static const int kCpuHasX86 = 0x10; +static const int kCpuHasSSE2 = 0x20; +static const int kCpuHasSSSE3 = 0x40; +static const int kCpuHasSSE41 = 0x80; +static const int kCpuHasSSE42 = 0x100; +static const int kCpuHasAVX = 0x200; +static const int kCpuHasAVX2 = 0x400; +static const int kCpuHasERMS = 0x800; +static const int kCpuHasFMA3 = 0x1000; +// 0x2000, 0x4000, 0x8000 reserved for future X86 flags. + +// These flags are only valid on MIPS processors. 
+static const int kCpuHasMIPS = 0x10000; +static const int kCpuHasMIPS_DSP = 0x20000; +static const int kCpuHasMIPS_DSPR2 = 0x40000; + +// Internal function used to auto-init. +LIBYUV_API +int InitCpuFlags(void); + +// Internal function for parsing /proc/cpuinfo. +LIBYUV_API +int ArmCpuCaps(const char* cpuinfo_name); + +// Detect CPU has SSE2 etc. +// Test_flag parameter should be one of kCpuHas constants above. +// returns non-zero if instruction set is detected +static __inline int TestCpuFlag(int test_flag) { + LIBYUV_API extern int cpu_info_; + return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag; +} + +// For testing, allow CPU flags to be disabled. +// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. +// MaskCpuFlags(-1) to enable all cpu specific optimizations. +// MaskCpuFlags(0) to disable all cpu specific optimizations. +LIBYUV_API +void MaskCpuFlags(int enable_flags); + +// Low level cpuid for X86. Returns zeros on other CPUs. +// eax is the info type that you want. +// ecx is typically the cpu number, and should normally be zero. +LIBYUV_API +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/format_conversion.h b/TMessagesProj/jni/libyuv/include/libyuv/format_conversion.h new file mode 100644 index 000000000..b18bf0534 --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/format_conversion.h @@ -0,0 +1,168 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ // NOLINT +#define INCLUDE_LIBYUV_FORMATCONVERSION_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert Bayer RGB formats to I420. +LIBYUV_API +int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Temporary API mapper. +#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \ + BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f) + +LIBYUV_API +int BayerToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + uint32 src_fourcc_bayer); + +// Convert I420 to Bayer RGB formats. 
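Returning to the cpu_id.h helpers declared above, a small sketch of the usual runtime pattern: call TestCpuFlag to report which SIMD row functions will be used, and MaskCpuFlags(0) to force the portable C paths (handy when validating output against the reference code). The printf reporting is an assumption for the example.

#include <stdio.h>
#include "libyuv/cpu_id.h"

// Hypothetical example: report detected SIMD support and optionally disable
// all cpu specific optimizations so the C reference paths are exercised.
static void ReportAndMaybeDisableSimd(int force_c_paths) {
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
    printf("libyuv: NEON row functions available\n");
  }
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3)) {
    printf("libyuv: SSSE3 row functions available\n");
  }
  if (force_c_paths) {
    libyuv::MaskCpuFlags(0);  // disable all cpu specific optimizations.
  }
}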
+LIBYUV_API +int I420ToBayerBGGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToBayerGBRG(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToBayerGRBG(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToBayerRGGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Temporary API mapper. +#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \ + I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f) + +LIBYUV_API +int I420ToBayer(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height, + uint32 dst_fourcc_bayer); + +// Convert Bayer RGB formats to ARGB. +LIBYUV_API +int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Temporary API mapper. +#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f) + +LIBYUV_API +int BayerToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + uint32 src_fourcc_bayer); + +// Converts ARGB to Bayer RGB formats. +LIBYUV_API +int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +LIBYUV_API +int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +LIBYUV_API +int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +LIBYUV_API +int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +// Temporary API mapper. +#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f) + +LIBYUV_API +int ARGBToBayer(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height, + uint32 dst_fourcc_bayer); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/mjpeg_decoder.h b/TMessagesProj/jni/libyuv/include/libyuv/mjpeg_decoder.h new file mode 100644 index 000000000..8423121d1 --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/mjpeg_decoder.h @@ -0,0 +1,192 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT +#define INCLUDE_LIBYUV_MJPEG_DECODER_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +// NOTE: For a simplified public API use convert.h MJPGToI420(). + +struct jpeg_common_struct; +struct jpeg_decompress_struct; +struct jpeg_source_mgr; + +namespace libyuv { + +#ifdef __cplusplus +extern "C" { +#endif + +LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size); + +#ifdef __cplusplus +} // extern "C" +#endif + +static const uint32 kUnknownDataSize = 0xFFFFFFFF; + +enum JpegSubsamplingType { + kJpegYuv420, + kJpegYuv422, + kJpegYuv411, + kJpegYuv444, + kJpegYuv400, + kJpegUnknown +}; + +struct Buffer { + const uint8* data; + int len; +}; + +struct BufferVector { + Buffer* buffers; + int len; + int pos; +}; + +struct SetJmpErrorMgr; + +// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are +// simply independent JPEG images with a fixed huffman table (which is omitted). +// It is rarely used in video transmission, but is common as a camera capture +// format, especially in Logitech devices. This class implements a decoder for +// MJPEG frames. +// +// See http://tools.ietf.org/html/rfc2435 +class LIBYUV_API MJpegDecoder { + public: + typedef void (*CallbackFunction)(void* opaque, + const uint8* const* data, + const int* strides, + int rows); + + static const int kColorSpaceUnknown; + static const int kColorSpaceGrayscale; + static const int kColorSpaceRgb; + static const int kColorSpaceYCbCr; + static const int kColorSpaceCMYK; + static const int kColorSpaceYCCK; + + MJpegDecoder(); + ~MJpegDecoder(); + + // Loads a new frame, reads its headers, and determines the uncompressed + // image format. + // Returns LIBYUV_TRUE if image looks valid and format is supported. + // If return value is LIBYUV_TRUE, then the values for all the following + // getters are populated. + // src_len is the size of the compressed mjpeg frame in bytes. + LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len); + + // Returns width of the last loaded frame in pixels. + int GetWidth(); + + // Returns height of the last loaded frame in pixels. + int GetHeight(); + + // Returns format of the last loaded frame. The return value is one of the + // kColorSpace* constants. + int GetColorSpace(); + + // Number of color components in the color space. + int GetNumComponents(); + + // Sample factors of the n-th component. + int GetHorizSampFactor(int component); + + int GetVertSampFactor(int component); + + int GetHorizSubSampFactor(int component); + + int GetVertSubSampFactor(int component); + + // Public for testability. + int GetImageScanlinesPerImcuRow(); + + // Public for testability. + int GetComponentScanlinesPerImcuRow(int component); + + // Width of a component in bytes. + int GetComponentWidth(int component); + + // Height of a component. + int GetComponentHeight(int component); + + // Width of a component in bytes with padding for DCTSIZE. Public for testing. + int GetComponentStride(int component); + + // Size of a component in bytes. + int GetComponentSize(int component); + + // Call this after LoadFrame() if you decide you don't want to decode it + // after all. 
+ LIBYUV_BOOL UnloadFrame(); + + // Decodes the entire image into a one-buffer-per-color-component format. + // dst_width must match exactly. dst_height must be <= to image height; if + // less, the image is cropped. "planes" must have size equal to at least + // GetNumComponents() and they must point to non-overlapping buffers of size + // at least GetComponentSize(i). The pointers in planes are incremented + // to point to after the end of the written data. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height); + + // Decodes the entire image and passes the data via repeated calls to a + // callback function. Each call will get the data for a whole number of + // image scanlines. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque, + int dst_width, int dst_height); + + // The helper function which recognizes the jpeg sub-sampling type. + static JpegSubsamplingType JpegSubsamplingTypeHelper( + int* subsample_x, int* subsample_y, int number_of_components); + + private: + void AllocOutputBuffers(int num_outbufs); + void DestroyOutputBuffers(); + + LIBYUV_BOOL StartDecode(); + LIBYUV_BOOL FinishDecode(); + + void SetScanlinePointers(uint8** data); + LIBYUV_BOOL DecodeImcuRow(); + + int GetComponentScanlinePadding(int component); + + // A buffer holding the input data for a frame. + Buffer buf_; + BufferVector buf_vec_; + + jpeg_decompress_struct* decompress_struct_; + jpeg_source_mgr* source_mgr_; + SetJmpErrorMgr* error_mgr_; + + // LIBYUV_TRUE iff at least one component has scanline padding. (i.e., + // GetComponentScanlinePadding() != 0.) + LIBYUV_BOOL has_scanline_padding_; + + // Temporaries used to point to scanline outputs. + int num_outbufs_; // Outermost size of all arrays below. + uint8*** scanlines_; + int* scanlines_sizes_; + // Temporary buffer used for decoding when we can't decode directly to the + // output buffers. Large enough for just one iMCU row. + uint8** databuf_; + int* databuf_strides_; +}; + +} // namespace libyuv + +#endif // __cplusplus +#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/planar_functions.h b/TMessagesProj/jni/libyuv/include/libyuv/planar_functions.h new file mode 100644 index 000000000..d10a16985 --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/planar_functions.h @@ -0,0 +1,439 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT +#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ + +#include "libyuv/basic_types.h" + +// TODO(fbarchard): Remove the following headers includes. +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy a plane of data. 
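Before moving on to the planar helpers, here is a minimal sketch of the MJpegDecoder workflow described above: validate the compressed frame, load its headers, read the dimensions, then unload since nothing is decoded. It only illustrates the call order and is not code from this patch.

#include <stddef.h>
#include "libyuv/mjpeg_decoder.h"

// Hypothetical example: read the dimensions of an MJPEG frame without
// decoding any pixel data.
static int GetMjpegFrameSize(const uint8* frame, size_t frame_size,
                             int* width, int* height) {
  if (!libyuv::ValidateJpeg(frame, frame_size)) {
    return -1;  // not a plausible JPEG frame.
  }
  libyuv::MJpegDecoder decoder;
  if (!decoder.LoadFrame(frame, frame_size)) {
    return -1;  // headers unreadable or format unsupported.
  }
  *width = decoder.GetWidth();
  *height = decoder.GetHeight();
  decoder.UnloadFrame();  // loaded but not decoded, so release the frame.
  return 0;
}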
+LIBYUV_API +void CopyPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +LIBYUV_API +void CopyPlane_16(const uint16* src_y, int src_stride_y, + uint16* dst_y, int dst_stride_y, + int width, int height); + +// Set a plane of data to a 32 bit value. +LIBYUV_API +void SetPlane(uint8* dst_y, int dst_stride_y, + int width, int height, + uint32 value); + +// Copy I400. Supports inverting. +LIBYUV_API +int I400ToI400(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + + +// Copy I422 to I422. +#define I422ToI422 I422Copy +LIBYUV_API +int I422Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy I444 to I444. +#define I444ToI444 I444Copy +LIBYUV_API +int I444Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I420 to I400. (calls CopyPlane ignoring u/v). +LIBYUV_API +int I420ToI400(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Alias +#define I420ToI420Mirror I420Mirror + +// I420 mirror. +LIBYUV_API +int I420Mirror(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Alias +#define I400ToI400Mirror I400Mirror + +// I400 mirror. A single plane is mirrored horizontally. +// Pass negative height to achieve 180 degree rotation. +LIBYUV_API +int I400Mirror(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Alias +#define ARGBToARGBMirror ARGBMirror + +// ARGB mirror. +LIBYUV_API +int ARGBMirror(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert NV21 to RGB565. +LIBYUV_API +int NV21ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// I422ToARGB is in convert_argb.h +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height); + +// Convert I422 to ABGR. 
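As a side note on the mirror helpers above: combining I400Mirror's horizontal mirror with a negative height yields the 180 degree rotation its comment mentions. A tiny sketch, with the caller-provided buffers assumed to be correctly sized:

#include "libyuv/planar_functions.h"

// Hypothetical example: rotate a single gray (I400) plane by 180 degrees by
// mirroring horizontally while passing a negative height.
static int RotateGrayPlane180(const uint8* src, int src_stride,
                              uint8* dst, int dst_stride,
                              int width, int height) {
  return libyuv::I400Mirror(src, src_stride, dst, dst_stride, width, -height);
}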
+LIBYUV_API +int I422ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + +// Convert I422 to RGBA. +LIBYUV_API +int I422ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +// Draw a rectangle into I420. +LIBYUV_API +int I420Rect(uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int x, int y, int width, int height, + int value_y, int value_u, int value_v); + +// Draw a rectangle into ARGB. +LIBYUV_API +int ARGBRect(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height, uint32 value); + +// Convert ARGB to gray scale ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height); + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height); + +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The next 4 coefficients apply to B, G, R, A and produce R of the output. +// The last 4 coefficients apply to B, G, R, A and produce A of the output. +LIBYUV_API +int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const int8* matrix_argb, + int width, int height); + +// Deprecated. Use ARGBColorMatrix instead. +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The last 4 coefficients apply to B, G, R, A and produce R of the output. +LIBYUV_API +int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_rgb, + int x, int y, int width, int height); + +// Apply a color table each ARGB pixel. +// Table contains 256 ARGB values. +LIBYUV_API +int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int x, int y, int width, int height); + +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int x, int y, int width, int height); + +// Apply a luma/color table each ARGB pixel but preserve destination alpha. +// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from +// RGB (YJ style) and C is an 8 bit color component (R, G or B). +LIBYUV_API +int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const uint8* luma_rgb_table, + int width, int height); + +// Apply a 3 term polynomial to ARGB values. +// poly points to a 4x4 matrix. The first row is constants. The 2nd row is +// coefficients for b, g, r and a. 
The 3rd row is coefficients for b squared, +// g squared, r squared and a squared. The 4rd row is coefficients for b to +// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and +// result clamped to 0 to 255. +// A polynomial approximation can be dirived using software such as 'R'. + +LIBYUV_API +int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const float* poly, + int width, int height); + +// Quantize a rectangle of ARGB. Alpha unaffected. +// scale is a 16 bit fractional fixed point scaler between 0 and 65535. +// interval_size should be a value between 1 and 255. +// interval_offset should be a value between 0 and 255. +LIBYUV_API +int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, + int scale, int interval_size, int interval_offset, + int x, int y, int width, int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); + +// Get function to Alpha Blend ARGB pixels and store to destination. +LIBYUV_API +ARGBBlendRow GetARGBBlend(); + +// Alpha Blend ARGB images and store to destination. +// Alpha of destination is set to 255. +LIBYUV_API +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. +LIBYUV_API +int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Add ARGB image with ARGB image. Saturates to 255. +LIBYUV_API +int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. +LIBYUV_API +int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to YUY2. +LIBYUV_API +int I422ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Convert I422 to UYVY. +LIBYUV_API +int I422ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Convert unattentuated ARGB to preattenuated ARGB. +LIBYUV_API +int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert preattentuated ARGB to unattenuated ARGB. +LIBYUV_API +int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert MJPG to ARGB. 
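One more note on the alpha helpers above: compositing arithmetic is usually carried out on premultiplied pixels, so a common pattern is to round-trip through ARGBAttenuate and ARGBUnattenuate around the blending or arithmetic step. A sketch of that round trip follows; the separate work buffer and the surrounding workflow are assumptions, not guidance from this header.

#include "libyuv/planar_functions.h"

// Hypothetical example: convert unattenuated ARGB to preattenuated ARGB
// before doing arithmetic on it, then convert the result back.
static int RoundTripPremultiply(const uint8* src_argb, int src_stride,
                                uint8* work_argb, int work_stride,
                                uint8* dst_argb, int dst_stride,
                                int width, int height) {
  if (libyuv::ARGBAttenuate(src_argb, src_stride,
                            work_argb, work_stride,
                            width, height) != 0) {
    return -1;
  }
  // ... arithmetic on the premultiplied pixels would go here ...
  return libyuv::ARGBUnattenuate(work_argb, work_stride,
                                 dst_argb, dst_stride,
                                 width, height);
}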
+LIBYUV_API +int MJPGToARGB(const uint8* sample, size_t sample_size, + uint8* argb, int argb_stride, + int w, int h, int dw, int dh); + +// Internal function - do not call directly. +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height); + +// Blur ARGB image. +// dst_cumsum table of width * (height + 1) * 16 bytes aligned to +// 16 byte boundary. +// dst_stride32_cumsum is number of ints in a row (width * 4). +// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. +// Blur is optimized for radius of 5 (11x11) or less. +LIBYUV_API +int ARGBBlur(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height, int radius); + +// Multiply ARGB image by ARGB value. +LIBYUV_API +int ARGBShade(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, uint32 value); + +// Interpolate between two ARGB images using specified amount of interpolation +// (0 to 255) and store to destination. +// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0 +// and 255 means 1% src_argb0 and 99% src_argb1. +// Internally uses ARGBScale bilinear filtering. +// Caveat: This function will write up to 16 bytes beyond the end of dst_argb. +LIBYUV_API +int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int interpolation); + +#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ + defined(TARGET_IPHONE_SIMULATOR) +#define LIBYUV_DISABLE_X86 +#endif + +// Row functions for copying a pixels from a source with a slope to a row +// of destination. Useful for scaling, rotation, mirror, texture mapping. +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); +#define HAS_ARGBAFFINEROW_SSE2 +#endif // LIBYUV_DISABLE_X86 + +// Shuffle ARGB channel order. e.g. BGRA to ARGB. +// shuffler is 16 bytes and must be aligned. +LIBYUV_API +int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + const uint8* shuffler, int width, int height); + +// Sobel ARGB effect with planar output. +LIBYUV_API +int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Sobel ARGB effect. +LIBYUV_API +int ARGBSobel(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB. 
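To make the ARGBBlur scratch-buffer requirement above concrete, here is a sketch that sizes the cumulative-sum table at width * (height + 1) * 16 bytes and rounds the pointer up to a 16 byte boundary. The manual malloc-plus-rounding is only one way to satisfy the alignment and is an assumption of the example.

#include <stdint.h>
#include <stdlib.h>
#include "libyuv/planar_functions.h"

// Hypothetical example: blur an ARGB image with a radius of 5 (an 11x11
// window).  The cumulative-sum table is width * (height + 1) * 16 bytes and
// must be 16 byte aligned, so a few spare bytes are allocated and the
// pointer is rounded up.
static int BlurARGBImage(const uint8* src_argb, int src_stride_argb,
                         uint8* dst_argb, int dst_stride_argb,
                         int width, int height) {
  const size_t cumsum_bytes = (size_t)width * (height + 1) * 16;
  uint8* raw = (uint8*)malloc(cumsum_bytes + 15);
  if (!raw) {
    return -1;
  }
  int32* cumsum = (int32*)(((uintptr_t)raw + 15) & ~(uintptr_t)15);
  int ret = libyuv::ARGBBlur(src_argb, src_stride_argb,
                             dst_argb, dst_stride_argb,
                             cumsum, width * 4 /* ints per cumulative row */,
                             width, height, 5 /* radius */);
  free(raw);
  return ret;
}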
+LIBYUV_API +int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/rotate.h b/TMessagesProj/jni/libyuv/include/libyuv/rotate.h new file mode 100644 index 000000000..8af60b895 --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/rotate.h @@ -0,0 +1,117 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Supported rotation. +typedef enum RotationMode { + kRotate0 = 0, // No rotation. + kRotate90 = 90, // Rotate 90 degrees clockwise. + kRotate180 = 180, // Rotate 180 degrees. + kRotate270 = 270, // Rotate 270 degrees clockwise. + + // Deprecated. + kRotateNone = 0, + kRotateClockwise = 90, + kRotateCounterClockwise = 270, +} RotationModeEnum; + +// Rotate I420 frame. +LIBYUV_API +int I420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, enum RotationMode mode); + +// Rotate NV12 input and store in I420. +LIBYUV_API +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, enum RotationMode mode); + +// Rotate a plane by 0, 90, 180, or 270. +LIBYUV_API +int RotatePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int src_width, int src_height, enum RotationMode mode); + +// Rotate planes by 90, 180, 270. Deprecated. +LIBYUV_API +void RotatePlane90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotatePlane180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotatePlane270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotateUV90(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +// Rotations for when U and V are interleaved. +// These functions take one input pointer and +// split the data into two buffers while +// rotating them. Deprecated. +LIBYUV_API +void RotateUV180(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +LIBYUV_API +void RotateUV270(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +// The 90 and 270 functions are based on transposes. +// Doing a transpose with reversing the read/write +// order will result in a rotation by +- 90 degrees. +// Deprecated. 
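For completeness, a sketch of the non-deprecated I420Rotate entry point above rotating a frame 90 degrees clockwise. That the destination planes are allocated with width and height swapped for 90/270 rotations is the caller's responsibility in this example and is an assumption, not something the header spells out.

#include <stdlib.h>
#include "libyuv/rotate.h"

// Hypothetical example: rotate an I420 frame 90 degrees clockwise into newly
// allocated planes sized for the swapped (src_height x src_width) geometry.
static int RotateI420By90(const uint8* src_y, int src_stride_y,
                          const uint8* src_u, int src_stride_u,
                          const uint8* src_v, int src_stride_v,
                          int src_width, int src_height) {
  const int dst_width = src_height;          // dimensions swap for 90/270.
  const int dst_height = src_width;
  const int dst_stride_y = dst_width;
  const int dst_stride_uv = (dst_width + 1) / 2;
  const int dst_uv_height = (dst_height + 1) / 2;
  uint8* dst_y = (uint8*)malloc((size_t)dst_stride_y * dst_height);
  uint8* dst_u = (uint8*)malloc((size_t)dst_stride_uv * dst_uv_height);
  uint8* dst_v = (uint8*)malloc((size_t)dst_stride_uv * dst_uv_height);
  int ret = -1;
  if (dst_y && dst_u && dst_v) {
    ret = libyuv::I420Rotate(src_y, src_stride_y,
                             src_u, src_stride_u,
                             src_v, src_stride_v,
                             dst_y, dst_stride_y,
                             dst_u, dst_stride_uv,
                             dst_v, dst_stride_uv,
                             src_width, src_height, libyuv::kRotate90);
  }
  free(dst_y);
  free(dst_u);
  free(dst_v);
  return ret;
}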
+LIBYUV_API +void TransposePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void TransposeUV(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/rotate_argb.h b/TMessagesProj/jni/libyuv/include/libyuv/rotate_argb.h new file mode 100644 index 000000000..660ff5573 --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/rotate_argb.h @@ -0,0 +1,33 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" // For RotationMode. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Rotate ARGB frame +LIBYUV_API +int ARGBRotate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int src_width, int src_height, enum RotationMode mode); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/row.h b/TMessagesProj/jni/libyuv/include/libyuv/row.h new file mode 100644 index 000000000..477b27447 --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/row.h @@ -0,0 +1,1821 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_ROW_H_ + +#include // For malloc. + +#include "libyuv/basic_types.h" + +#if defined(__native_client__) +#include "ppapi/c/pp_macros.h" // For PPAPI_RELEASE +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) + +#ifdef __cplusplus +#define align_buffer_64(var, size) \ + uint8* var##_mem = reinterpret_cast(malloc((size) + 63)); \ + uint8* var = reinterpret_cast \ + ((reinterpret_cast(var##_mem) + 63) & ~63) +#else +#define align_buffer_64(var, size) \ + uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \ + uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ +#endif + +#define free_aligned_buffer_64(var) \ + free(var##_mem); \ + var = 0 + +#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ + defined(TARGET_IPHONE_SIMULATOR) || \ + (defined(_MSC_VER) && defined(__clang__)) +#define LIBYUV_DISABLE_X86 +#endif +// True if compiling for SSSE3 as a requirement. +#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3)) +#define LIBYUV_SSSE3_ONLY +#endif + +// Enable for NaCL pepper 33 for bundle and AVX2 support. 
+#if defined(__native_client__) && PPAPI_RELEASE >= 33 +#define NEW_BINUTILS +#endif +#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37 +#define LIBYUV_DISABLE_NEON +#endif + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +// Effects: +#define HAS_ARGBADDROW_SSE2 +#define HAS_ARGBAFFINEROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSSE3 +#define HAS_ARGBBLENDROW_SSSE3 +#define HAS_ARGBCOLORMATRIXROW_SSSE3 +#define HAS_ARGBCOLORTABLEROW_X86 +#define HAS_ARGBCOPYALPHAROW_SSE2 +#define HAS_ARGBCOPYYTOALPHAROW_SSE2 +#define HAS_ARGBGRAYROW_SSSE3 +#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 +#define HAS_ARGBMIRRORROW_SSSE3 +#define HAS_ARGBMULTIPLYROW_SSE2 +#define HAS_ARGBPOLYNOMIALROW_SSE2 +#define HAS_ARGBQUANTIZEROW_SSE2 +#define HAS_ARGBSEPIAROW_SSSE3 +#define HAS_ARGBSHADEROW_SSE2 +#define HAS_ARGBSUBTRACTROW_SSE2 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_ARGBUNATTENUATEROW_SSE2 +#define HAS_COMPUTECUMULATIVESUMROW_SSE2 +#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +#define HAS_INTERPOLATEROW_SSE2 +#define HAS_INTERPOLATEROW_SSSE3 +#define HAS_RGBCOLORTABLEROW_X86 +#define HAS_SOBELROW_SSE2 +#define HAS_SOBELTOPLANEROW_SSE2 +#define HAS_SOBELXROW_SSE2 +#define HAS_SOBELXYROW_SSE2 +#define HAS_SOBELYROW_SSE2 + +// Conversions: +#define HAS_ABGRTOUVROW_SSSE3 +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_ARGB1555TOARGBROW_SSE2 +#define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBSHUFFLEROW_SSE2 +#define HAS_ARGBSHUFFLEROW_SSSE3 +#define HAS_ARGBTOARGB1555ROW_SSE2 +#define HAS_ARGBTOARGB4444ROW_SSE2 +#define HAS_ARGBTOBAYERGGROW_SSE2 +#define HAS_ARGBTOBAYERROW_SSSE3 +#define HAS_ARGBTORAWROW_SSSE3 +#define HAS_ARGBTORGB24ROW_SSSE3 +#define HAS_ARGBTORGB565ROW_SSE2 +#define HAS_ARGBTOUV422ROW_SSSE3 +#define HAS_ARGBTOUV444ROW_SSSE3 +#define HAS_ARGBTOUVJROW_SSSE3 +#define HAS_ARGBTOYJROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 +#define HAS_BGRATOUVROW_SSSE3 +#define HAS_BGRATOYROW_SSSE3 +#define HAS_COPYROW_ERMS +#define HAS_COPYROW_SSE2 +#define HAS_COPYROW_X86 +#define HAS_HALFROW_SSE2 +#define HAS_I400TOARGBROW_SSE2 +#define HAS_I411TOARGBROW_SSSE3 +#define HAS_I422TOARGB1555ROW_SSSE3 +#define HAS_I422TOABGRROW_SSSE3 +#define HAS_I422TOARGB1555ROW_SSSE3 +#define HAS_I422TOARGB4444ROW_SSSE3 +#define HAS_I422TOARGBROW_SSSE3 +#define HAS_I422TOBGRAROW_SSSE3 +#define HAS_I422TORAWROW_SSSE3 +#define HAS_I422TORGB24ROW_SSSE3 +#define HAS_I422TORGB565ROW_SSSE3 +#define HAS_I422TORGBAROW_SSSE3 +#define HAS_I422TOUYVYROW_SSE2 +#define HAS_I422TOYUY2ROW_SSE2 +#define HAS_I444TOARGBROW_SSSE3 +#define HAS_MERGEUVROW_SSE2 +#define HAS_MIRRORROW_SSE2 +#define HAS_MIRRORROW_SSSE3 +#define HAS_MIRRORROW_UV_SSSE3 +#define HAS_MIRRORUVROW_SSSE3 +#define HAS_NV12TOARGBROW_SSSE3 +#define HAS_NV12TORGB565ROW_SSSE3 +#define HAS_NV21TOARGBROW_SSSE3 +#define HAS_NV21TORGB565ROW_SSSE3 +#define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 +#define HAS_RGB24TOARGBROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 +#define HAS_RGB565TOARGBROW_SSE2 +#define HAS_RGBATOUVROW_SSSE3 +#define HAS_RGBATOYROW_SSSE3 +#define HAS_SETROW_X86 +#define HAS_SPLITUVROW_SSE2 +#define HAS_UYVYTOARGBROW_SSSE3 +#define HAS_UYVYTOUV422ROW_SSE2 +#define HAS_UYVYTOUVROW_SSE2 +#define HAS_UYVYTOYROW_SSE2 +#define HAS_YTOARGBROW_SSE2 +#define HAS_YUY2TOARGBROW_SSSE3 +#define HAS_YUY2TOUV422ROW_SSE2 +#define HAS_YUY2TOUVROW_SSE2 +#define HAS_YUY2TOYROW_SSE2 +#endif + +// The following are available on x64 Visual C: +#if 
!defined(LIBYUV_DISABLE_X86) && defined (_M_X64) +#define HAS_I422TOARGBROW_SSSE3 +#endif + +// GCC >= 4.7.0 required for AVX2. +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// clang >= 3.4.0 required for AVX2. +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) +#define CLANG_HAS_AVX2 1 +#endif // clang >= 3.4 +#endif // __clang__ + +// Visual C 2012 required for AVX2. +#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// The following are available on all x86 platforms, but +// require VS2012, clang 3.4 or gcc 4.7. +// The code supports NaCL but requires a new compiler and validator. +#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ + defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +// Effects: +#define HAS_ARGBPOLYNOMIALROW_AVX2 +#define HAS_ARGBSHUFFLEROW_AVX2 +#define HAS_ARGBCOPYALPHAROW_AVX2 +#define HAS_ARGBCOPYYTOALPHAROW_AVX2 +#endif + +// The following are require VS2012. +// TODO(fbarchard): Port to gcc. +#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) +#define HAS_ARGBTOUVROW_AVX2 +#define HAS_ARGBTOYJROW_AVX2 +#define HAS_ARGBTOYROW_AVX2 +#define HAS_HALFROW_AVX2 +#define HAS_I422TOARGBROW_AVX2 +#define HAS_INTERPOLATEROW_AVX2 +#define HAS_MERGEUVROW_AVX2 +#define HAS_MIRRORROW_AVX2 +#define HAS_SPLITUVROW_AVX2 +#define HAS_UYVYTOUV422ROW_AVX2 +#define HAS_UYVYTOUVROW_AVX2 +#define HAS_UYVYTOYROW_AVX2 +#define HAS_YUY2TOUV422ROW_AVX2 +#define HAS_YUY2TOUVROW_AVX2 +#define HAS_YUY2TOYROW_AVX2 + +// Effects: +#define HAS_ARGBADDROW_AVX2 +#define HAS_ARGBATTENUATEROW_AVX2 +#define HAS_ARGBMIRRORROW_AVX2 +#define HAS_ARGBMULTIPLYROW_AVX2 +#define HAS_ARGBSUBTRACTROW_AVX2 +#define HAS_ARGBUNATTENUATEROW_AVX2 +#endif // defined(VISUALC_HAS_AVX2) + +// The following are Yasm x86 only: +// TODO(fbarchard): Port AVX2 to inline. 
+#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM) + (defined(_M_IX86) || defined(_M_X64) || \ + defined(__x86_64__) || defined(__i386__)) +#define HAS_MERGEUVROW_AVX2 +#define HAS_MERGEUVROW_MMX +#define HAS_SPLITUVROW_AVX2 +#define HAS_SPLITUVROW_MMX +#define HAS_UYVYTOYROW_AVX2 +#define HAS_UYVYTOYROW_MMX +#define HAS_YUY2TOYROW_AVX2 +#define HAS_YUY2TOYROW_MMX +#endif + +// The following are disabled when SSSE3 is available: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ + !defined(LIBYUV_SSSE3_ONLY) +#define HAS_ARGBBLENDROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSE2 +#define HAS_MIRRORROW_SSE2 +#endif + +// The following are available on arm64 platforms: +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +// #define HAS_I444TOARGBROW_NEON +// #define HAS_I422TOARGBROW_NEON +// #define HAS_I411TOARGBROW_NEON +// #define HAS_I422TOBGRAROW_NEON +// #define HAS_I422TOABGRROW_NEON +// #define HAS_I422TORGBAROW_NEON +// #define HAS_I422TORGB24ROW_NEON +// #define HAS_I422TORAWROW_NEON +// #define HAS_I422TORGB565ROW_NEON +// #define HAS_I422TOARGB1555ROW_NEON +// #define HAS_I422TOARGB4444ROW_NEON +// #define HAS_YTOARGBROW_NEON +// #define HAS_I400TOARGBROW_NEON +// #define HAS_NV12TOARGBROW_NEON +// #define HAS_NV21TOARGBROW_NEON +// #define HAS_NV12TORGB565ROW_NEON +// #define HAS_NV21TORGB565ROW_NEON +// #define HAS_YUY2TOARGBROW_NEON +// #define HAS_UYVYTOARGBROW_NEON +#define HAS_SPLITUVROW_NEON +#define HAS_MERGEUVROW_NEON +#define HAS_COPYROW_NEON +#define HAS_SETROW_NEON +#define HAS_ARGBSETROWS_NEON +#define HAS_MIRRORROW_NEON +#define HAS_MIRRORUVROW_NEON +#define HAS_ARGBMIRRORROW_NEON +#define HAS_RGB24TOARGBROW_NEON +#define HAS_RAWTOARGBROW_NEON +// #define HAS_RGB565TOARGBROW_NEON +// #define HAS_ARGB1555TOARGBROW_NEON +// #define HAS_ARGB4444TOARGBROW_NEON +#define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORAWROW_NEON +#define HAS_YUY2TOYROW_NEON +#define HAS_UYVYTOYROW_NEON +#define HAS_YUY2TOUV422ROW_NEON +#define HAS_UYVYTOUV422ROW_NEON +#define HAS_YUY2TOUVROW_NEON +#define HAS_UYVYTOUVROW_NEON +#define HAS_HALFROW_NEON +#define HAS_ARGBTOBAYERROW_NEON +#define HAS_ARGBTOBAYERGGROW_NEON +#define HAS_ARGBSHUFFLEROW_NEON +#define HAS_I422TOYUY2ROW_NEON +#define HAS_I422TOUYVYROW_NEON +// #define HAS_ARGBTORGB565ROW_NEON +// #define HAS_ARGBTOARGB1555ROW_NEON +// #define HAS_ARGBTOARGB4444ROW_NEON +#define HAS_ARGBTOYROW_NEON +#define HAS_ARGBTOYJROW_NEON +#define HAS_ARGBTOUV444ROW_NEON +#define HAS_ARGBTOUV422ROW_NEON +#define HAS_ARGBTOUV411ROW_NEON +// #define HAS_ARGBTOUVROW_NEON +// #define HAS_ARGBTOUVJROW_NEON +// #define HAS_BGRATOUVROW_NEON +// #define HAS_ABGRTOUVROW_NEON +// #define HAS_RGBATOUVROW_NEON +// #define HAS_RGB24TOUVROW_NEON +// #define HAS_RAWTOUVROW_NEON +// #define HAS_RGB565TOUVROW_NEON +// #define HAS_ARGB1555TOUVROW_NEON +// #define HAS_ARGB4444TOUVROW_NEON +// #define HAS_RGB565TOYROW_NEON +// #define HAS_ARGB1555TOYROW_NEON +// #define HAS_ARGB4444TOYROW_NEON +#define HAS_BGRATOYROW_NEON +#define HAS_ABGRTOYROW_NEON +#define HAS_RGBATOYROW_NEON +#define HAS_RGB24TOYROW_NEON +#define HAS_RAWTOYROW_NEON +#define HAS_INTERPOLATEROW_NEON +#define HAS_ARGBBLENDROW_NEON +#define HAS_ARGBATTENUATEROW_NEON +#define HAS_ARGBQUANTIZEROW_NEON +#define HAS_ARGBSHADEROW_NEON +#define HAS_ARGBGRAYROW_NEON +#define HAS_ARGBSEPIAROW_NEON +#define HAS_ARGBCOLORMATRIXROW_NEON +#define HAS_ARGBMULTIPLYROW_NEON +#define HAS_ARGBADDROW_NEON +#define HAS_ARGBSUBTRACTROW_NEON +#define 
HAS_SOBELROW_NEON +#define HAS_SOBELTOPLANEROW_NEON +#define HAS_SOBELXYROW_NEON +#define HAS_SOBELXROW_NEON +#define HAS_SOBELYROW_NEON +#endif + +// The following are available on Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_ABGRTOUVROW_NEON +#define HAS_ABGRTOYROW_NEON +#define HAS_ARGB1555TOARGBROW_NEON +#define HAS_ARGB1555TOUVROW_NEON +#define HAS_ARGB1555TOYROW_NEON +#define HAS_ARGB4444TOARGBROW_NEON +#define HAS_ARGB4444TOUVROW_NEON +#define HAS_ARGB4444TOYROW_NEON +#define HAS_ARGBTOARGB1555ROW_NEON +#define HAS_ARGBTOARGB4444ROW_NEON +#define HAS_ARGBTOBAYERROW_NEON +#define HAS_ARGBTOBAYERGGROW_NEON +#define HAS_ARGBTORAWROW_NEON +#define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORGB565ROW_NEON +#define HAS_ARGBTOUV411ROW_NEON +#define HAS_ARGBTOUV422ROW_NEON +#define HAS_ARGBTOUV444ROW_NEON +#define HAS_ARGBTOUVROW_NEON +#define HAS_ARGBTOUVJROW_NEON +#define HAS_ARGBTOYROW_NEON +#define HAS_ARGBTOYJROW_NEON +#define HAS_BGRATOUVROW_NEON +#define HAS_BGRATOYROW_NEON +#define HAS_COPYROW_NEON +#define HAS_HALFROW_NEON +#define HAS_I400TOARGBROW_NEON +#define HAS_I411TOARGBROW_NEON +#define HAS_I422TOABGRROW_NEON +#define HAS_I422TOARGB1555ROW_NEON +#define HAS_I422TOARGB4444ROW_NEON +#define HAS_I422TOARGBROW_NEON +#define HAS_I422TOBGRAROW_NEON +#define HAS_I422TORAWROW_NEON +#define HAS_I422TORGB24ROW_NEON +#define HAS_I422TORGB565ROW_NEON +#define HAS_I422TORGBAROW_NEON +#define HAS_I422TOUYVYROW_NEON +#define HAS_I422TOYUY2ROW_NEON +#define HAS_I444TOARGBROW_NEON +#define HAS_MERGEUVROW_NEON +#define HAS_MIRRORROW_NEON +#define HAS_MIRRORUVROW_NEON +#define HAS_NV12TOARGBROW_NEON +#define HAS_NV12TORGB565ROW_NEON +#define HAS_NV21TOARGBROW_NEON +#define HAS_NV21TORGB565ROW_NEON +#define HAS_RAWTOARGBROW_NEON +#define HAS_RAWTOUVROW_NEON +#define HAS_RAWTOYROW_NEON +#define HAS_RGB24TOARGBROW_NEON +#define HAS_RGB24TOUVROW_NEON +#define HAS_RGB24TOYROW_NEON +#define HAS_RGB565TOARGBROW_NEON +#define HAS_RGB565TOUVROW_NEON +#define HAS_RGB565TOYROW_NEON +#define HAS_RGBATOUVROW_NEON +#define HAS_RGBATOYROW_NEON +#define HAS_SETROW_NEON +#define HAS_SPLITUVROW_NEON +#define HAS_UYVYTOARGBROW_NEON +#define HAS_UYVYTOUV422ROW_NEON +#define HAS_UYVYTOUVROW_NEON +#define HAS_UYVYTOYROW_NEON +#define HAS_YTOARGBROW_NEON +#define HAS_YUY2TOARGBROW_NEON +#define HAS_YUY2TOUV422ROW_NEON +#define HAS_YUY2TOUVROW_NEON +#define HAS_YUY2TOYROW_NEON + +// Effects: +#define HAS_ARGBADDROW_NEON +#define HAS_ARGBATTENUATEROW_NEON +#define HAS_ARGBBLENDROW_NEON +#define HAS_ARGBGRAYROW_NEON +#define HAS_ARGBMIRRORROW_NEON +#define HAS_ARGBMULTIPLYROW_NEON +#define HAS_ARGBQUANTIZEROW_NEON +#define HAS_ARGBSEPIAROW_NEON +#define HAS_ARGBSHADEROW_NEON +#define HAS_ARGBSUBTRACTROW_NEON +#define HAS_SOBELROW_NEON +#define HAS_SOBELTOPLANEROW_NEON +#define HAS_SOBELXYROW_NEON +#define HAS_SOBELXROW_NEON +#define HAS_SOBELYROW_NEON +#define HAS_INTERPOLATEROW_NEON +// TODO(fbarchard): Investigate neon unittest failure. 
+// #define HAS_ARGBCOLORMATRIXROW_NEON +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) +#define HAS_COPYROW_MIPS +#if defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_I422TOABGRROW_MIPS_DSPR2 +#define HAS_I422TOARGBROW_MIPS_DSPR2 +#define HAS_I422TOBGRAROW_MIPS_DSPR2 +#define HAS_INTERPOLATEROWS_MIPS_DSPR2 +#define HAS_MIRRORROW_MIPS_DSPR2 +#define HAS_MIRRORUVROW_MIPS_DSPR2 +#define HAS_SPLITUVROW_MIPS_DSPR2 +#endif +#endif + +#if defined(_MSC_VER) && !defined(__CLR_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +typedef __declspec(align(16)) int16 vec16[8]; +typedef __declspec(align(16)) int32 vec32[4]; +typedef __declspec(align(16)) int8 vec8[16]; +typedef __declspec(align(16)) uint16 uvec16[8]; +typedef __declspec(align(16)) uint32 uvec32[4]; +typedef __declspec(align(16)) uint8 uvec8[16]; +typedef __declspec(align(32)) int16 lvec16[16]; +typedef __declspec(align(32)) int32 lvec32[8]; +typedef __declspec(align(32)) int8 lvec8[32]; +typedef __declspec(align(32)) uint16 ulvec16[16]; +typedef __declspec(align(32)) uint32 ulvec32[8]; +typedef __declspec(align(32)) uint8 ulvec8[32]; + +#elif defined(__GNUC__) +// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +typedef int16 __attribute__((vector_size(16))) vec16; +typedef int32 __attribute__((vector_size(16))) vec32; +typedef int8 __attribute__((vector_size(16))) vec8; +typedef uint16 __attribute__((vector_size(16))) uvec16; +typedef uint32 __attribute__((vector_size(16))) uvec32; +typedef uint8 __attribute__((vector_size(16))) uvec8; +#else +#define SIMD_ALIGNED(var) var +typedef int16 vec16[8]; +typedef int32 vec32[4]; +typedef int8 vec8[16]; +typedef uint16 uvec16[8]; +typedef uint32 uvec32[4]; +typedef uint8 uvec8[16]; +#endif + +#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) +#define OMITFP +#else +#define OMITFP __attribute__((optimize("omit-frame-pointer"))) +#endif + +// NaCL macros for GCC x86 and x64. + +// TODO(nfullagar): When pepper_33 toolchain is distributed, default to +// NEW_BINUTILS and remove all BUNDLEALIGN occurances. 
+#if defined(__native_client__) +#define LABELALIGN ".p2align 5\n" +#else +#define LABELALIGN ".p2align 2\n" +#endif +#if defined(__native_client__) && defined(__x86_64__) +#if defined(NEW_BINUTILS) +#define BUNDLELOCK ".bundle_lock\n" +#define BUNDLEUNLOCK ".bundle_unlock\n" +#define BUNDLEALIGN "\n" +#else +#define BUNDLELOCK "\n" +#define BUNDLEUNLOCK "\n" +#define BUNDLEALIGN ".p2align 5\n" +#endif +#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" +#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" +#define MEMLEA(offset, base) #offset "(%q" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%q" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%q" #base ",%q" #index "," #scale ")" +#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15" +#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%%" #reg "\n" \ + BUNDLEUNLOCK +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " %%" #reg ",(%%r15,%%r14)\n" \ + BUNDLEUNLOCK +#define MEMOPARG(opcode, offset, base, index, scale, arg) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%" #arg "\n" \ + BUNDLEUNLOCK +#else // defined(__native_client__) && defined(__x86_64__) +#define BUNDLEALIGN "\n" +#define MEMACCESS(base) "(%" #base ")" +#define MEMACCESS2(offset, base) #offset "(%" #base ")" +#define MEMLEA(offset, base) #offset "(%" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%" #base ",%" #index "," #scale ")" +#define MEMMOVESTRING(s, d) +#define MEMSTORESTRING(reg, d) +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" +#define MEMOPARG(opcode, offset, base, index, scale, arg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" +#endif // defined(__native_client__) && defined(__x86_64__) + +#if defined(__arm__) || defined(__aarch64__) +#undef MEMACCESS +#if defined(__native_client__) +#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" +#else +#define MEMACCESS(base) "\n" +#endif +#endif + +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRAWRow_NEON(const 
uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); +void I422ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width); +void I422ToARGB1555Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width); +void I422ToARGB4444Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width); +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width); +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_vu, + uint8* dst_rgb565, + int width); +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width); + +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); +void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_Unaligned_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_Unaligned_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix); +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix); +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix); +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix); +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix); +void 
RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix); +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix); +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix); +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix); +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix); +void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix); +void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix); +void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix); +void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix); +void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); +void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix); +void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix); +void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix); +void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix); + +void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void 
RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix); +void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix); +void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix); +void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix); +void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix); +void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444, + int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int width); +void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int width); 
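// The suffix convention in these declarations is uniform across libyuv row
// functions: _C is the portable reference version, _SSSE3/_AVX2/_NEON are SIMD
// specialisations, _Unaligned_* tolerates unaligned pointers, and _Any_* copes
// with widths that are not a multiple of the SIMD step. A sketch of the
// selection a caller typically performs (illustrative only; PickARGBToUVRow is
// a hypothetical helper, while TestCpuFlag/kCpuHasSSSE3/IS_ALIGNED come from
// libyuv's cpu_id.h and basic_types.h):
static void PickARGBToUVRow(const uint8* src_argb, int src_stride_argb,
                            int width,
                            void (**row)(const uint8*, int, uint8*, uint8*,
                                         int)) {
  *row = ARGBToUVRow_C;  // always-correct fallback
#if defined(HAS_ARGBTOUVROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
    *row = ARGBToUVRow_Any_SSSE3;            // any width
    if (IS_ALIGNED(width, 16)) {
      *row = ARGBToUVRow_Unaligned_SSSE3;    // width is a multiple of 16
      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
        *row = ARGBToUVRow_SSSE3;            // fully aligned fast path
      }
    }
  }
#endif
}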
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width); +void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width); + +void ARGBToUV444Row_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); + +void ARGBToUV422Row_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); + +void ARGBToUV444Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV411Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); + +void MirrorRow_AVX2(const uint8* src, uint8* dst, int width); +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width); +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width); +void MirrorRow_NEON(const uint8* src, uint8* dst, int width); +void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width); +void MirrorRow_C(const uint8* src, uint8* dst, int width); + +void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); + +void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width); + +void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, + uint8* dst_v, int pix); +void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); + +void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, + uint8* dst_uv, int width); +void MergeUVRow_Any_SSE2(const uint8* 
src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); + +void CopyRow_SSE2(const uint8* src, uint8* dst, int count); +void CopyRow_ERMS(const uint8* src, uint8* dst, int count); +void CopyRow_X86(const uint8* src, uint8* dst, int count); +void CopyRow_NEON(const uint8* src, uint8* dst, int count); +void CopyRow_MIPS(const uint8* src, uint8* dst, int count); +void CopyRow_C(const uint8* src, uint8* dst, int count); + +void CopyRow_16_C(const uint16* src, uint16* dst, int count); + +void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); + +void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); +void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); + +void SetRow_X86(uint8* dst, uint32 v32, int count); +void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, + int dst_stride, int height); +void SetRow_NEON(uint8* dst, uint32 v32, int count); +void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, + int dst_stride, int height); +void SetRow_C(uint8* dst, uint32 v32, int count); +void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride, + int height); + +// ARGBShufflers for BGRAToARGB etc. +void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); + +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, + int pix); + +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix); +void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix); +void 
RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); +void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); +void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb, + int pix); +void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb, + int pix); +void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb, + int pix); +void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix); + +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); + +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix); + +void I444ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_C(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_C(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_C(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_C(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void 
YUY2ToARGBRow_C(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_C(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void I422ToBGRARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB24Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRAWRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); +void I422ToARGB4444Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width); +void I422ToARGB1555Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width); +void I422ToRGB565Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width); +void YToARGBRow_C(const uint8* src_y, + uint8* dst_argb, + int width); +void I422ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I444ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void I422ToBGRARow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB4444Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB1555Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +// RGB24/RAW are unaligned. 
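// (RGB24 and RAW pixels are 3 bytes each, so destination rows generally cannot
// be kept 16-byte aligned; hence no aligned/unaligned variant split for the
// SSSE3 versions below.)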
+void I422ToRGB24Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRAWRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); + +void I444ToARGBRow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void I422ToBGRARow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I444ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void I422ToBGRARow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB565Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +// RGB24/RAW are unaligned. 
+void I422ToRGB24Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRAWRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void YToARGBRow_SSE2(const uint8* src_y, + uint8* dst_argb, + int width); +void YToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width); +void YToARGBRow_Any_SSE2(const uint8* src_y, + uint8* dst_argb, + int width); +void YToARGBRow_Any_NEON(const uint8* src_y, + uint8* dst_argb, + int width); + +// ARGB preattenuated alpha blend. +void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +// ARGB multiply images. Same API as Blend, but these require +// pointer and width alignment for SSE2. +void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +// ARGB add images. +void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +// ARGB subtract images. Same API as Blend, but these require +// pointer and width alignment for SSE2. 
+void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); + +void I444ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGBARow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB24Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRAWRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB4444Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB1555Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB565Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void 
I422ToARGBRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); + +void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix); +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_C(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix); +void UYVYToUVRow_Unaligned_SSE2(const 
uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_NEON(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); + +void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_C(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); + +void HalfRow_C(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix); +void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix); +void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix); +void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix); + +void HalfRow_16_C(const uint16* src_uv, int src_uv_stride, + uint16* dst_uv, int pix); + +void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); +void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); +void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); +void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); + +void I422ToYUY2Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width); +void I422ToUYVYRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width); +void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* 
dst_yuy2, int width); +void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width); +void I422ToYUY2Row_Any_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width); +void I422ToUYVYRow_Any_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width); +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width); +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width); +void I422ToYUY2Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width); +void I422ToUYVYRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width); + +// Effects related row functions. +void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, + int width); + +// Inverse table for unattenuate, shared by C and SSE2. +extern const uint32 fixed_invtbl8[256]; +void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + int width); + +void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); + +void ARGBSepiaRow_C(uint8* dst_argb, int width); +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); +void ARGBSepiaRow_NEON(uint8* dst_argb, int width); + +void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); + +void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); + +void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); + +void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); + +void 
ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); + +// Used for blur. +void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count); +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width); + +void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count); +void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width); + +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); + +// Used for I420Scale, ARGBScale, and ARGBInterpolate. +void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); + +void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); + +// Sobel images. 
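// (SobelXRow_*/SobelYRow_* compute the X and Y gradient components from
// neighbouring luma rows; SobelRow_*, SobelToPlaneRow_* and SobelXYRow_* then
// combine the two gradient planes into an ARGB or single-plane output row, as
// the signatures below show.)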
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, + uint8* dst_sobelx, int width); +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width); +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width); +void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); +void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); + +void ARGBPolynomialRow_C(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); + +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, uint32 lumacoeff); +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/scale.h b/TMessagesProj/jni/libyuv/include/libyuv/scale.h new file mode 100644 index 000000000..a3bc07e0f --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/scale.h @@ -0,0 +1,102 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Supported filtering. +typedef enum FilterMode { + kFilterNone = 0, // Point sample; Fastest. + kFilterLinear = 1, // Filter horizontally only. + kFilterBilinear = 2, // Faster than box, but lower quality scaling down. + kFilterBox = 3 // Highest quality. +} FilterModeEnum; + +// Scale a YUV plane. 
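// A minimal call sketch against the declaration below (buffer names and sizes
// are assumed values, not taken from libyuv): halve a 640x360 Y plane with
// bilinear filtering:
//   ScalePlane(src_y, 640, 640, 360,
//              dst_y, 320, 320, 180,
//              kFilterBilinear);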
+LIBYUV_API +void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + enum FilterMode filtering); + +void ScalePlane_16(const uint16* src, int src_stride, + int src_width, int src_height, + uint16* dst, int dst_stride, + int dst_width, int dst_height, + enum FilterMode filtering); + +// Scales a YUV 4:2:0 image from the src width and height to the +// dst width and height. +// If filtering is kFilterNone, a simple nearest-neighbor algorithm is +// used. This produces basic (blocky) quality at the fastest speed. +// If filtering is kFilterBilinear, interpolation is used to produce a better +// quality image, at the expense of speed. +// If filtering is kFilterBox, averaging is used to produce ever better +// quality image, at further expense of speed. +// Returns 0 if successful. + +LIBYUV_API +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + enum FilterMode filtering); + +LIBYUV_API +int I420Scale_16(const uint16* src_y, int src_stride_y, + const uint16* src_u, int src_stride_u, + const uint16* src_v, int src_stride_v, + int src_width, int src_height, + uint16* dst_y, int dst_stride_y, + uint16* dst_u, int dst_stride_u, + uint16* dst_v, int dst_stride_v, + int dst_width, int dst_height, + enum FilterMode filtering); + +#ifdef __cplusplus +// Legacy API. Deprecated. +LIBYUV_API +int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + LIBYUV_BOOL interpolate); + +// Legacy API. Deprecated. +LIBYUV_API +int ScaleOffset(const uint8* src_i420, int src_width, int src_height, + uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset, + LIBYUV_BOOL interpolate); + +// For testing, allow disabling of specialized scalers. +LIBYUV_API +void SetUseReferenceImpl(LIBYUV_BOOL use); +#endif // __cplusplus + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/scale_argb.h b/TMessagesProj/jni/libyuv/include/libyuv/scale_argb.h new file mode 100644 index 000000000..0c9b36257 --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/scale_argb.h @@ -0,0 +1,57 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/scale.h" // For FilterMode + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +int ARGBScale(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + enum FilterMode filtering); + +// Clipped scale takes destination rectangle coordinates for clip values. +LIBYUV_API +int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering); + +// TODO(fbarchard): Implement this. +// Scale with YUV conversion to ARGB and clipping. +LIBYUV_API +int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint32 src_fourcc, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + uint32 dst_fourcc, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/scale_row.h b/TMessagesProj/jni/libyuv/include/libyuv/scale_row.h new file mode 100644 index 000000000..70e6bc55b --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/scale_row.h @@ -0,0 +1,349 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_ROW_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ + defined(TARGET_IPHONE_SIMULATOR) +#define LIBYUV_DISABLE_X86 +#endif + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_SCALEROWDOWN2_SSE2 +#define HAS_SCALEROWDOWN4_SSE2 +#define HAS_SCALEROWDOWN34_SSSE3 +#define HAS_SCALEROWDOWN38_SSSE3 +#define HAS_SCALEADDROWS_SSE2 +#define HAS_SCALEFILTERCOLS_SSSE3 +#define HAS_SCALECOLSUP2_SSE2 +#define HAS_SCALEARGBROWDOWN2_SSE2 +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +#define HAS_SCALEARGBCOLS_SSE2 +#define HAS_SCALEARGBFILTERCOLS_SSSE3 +#define HAS_SCALEARGBCOLSUP2_SSE2 +#define HAS_FIXEDDIV_X86 +#define HAS_FIXEDDIV1_X86 +#endif + +// The following are available on Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_SCALEROWDOWN2_NEON +#define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEROWDOWN34_NEON +#define HAS_SCALEROWDOWN38_NEON +#define HAS_SCALEARGBROWDOWNEVEN_NEON +#define HAS_SCALEARGBROWDOWN2_NEON +#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__aarch64__) || defined(LIBYUV_NEON)) +#define HAS_SCALEROWDOWN2_NEON +#define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEROWDOWN34_NEON +#define HAS_SCALEROWDOWN38_NEON +#define HAS_SCALEARGBROWDOWN2_NEON +#define HAS_SCALEARGBROWDOWNEVEN_NEON +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_SCALEROWDOWN2_MIPS_DSPR2 +#define HAS_SCALEROWDOWN4_MIPS_DSPR2 +#define HAS_SCALEROWDOWN34_MIPS_DSPR2 +#define HAS_SCALEROWDOWN38_MIPS_DSPR2 +#endif + +// Scale ARGB vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int y, int dy, + int bpp, enum FilterMode filtering); + +void ScalePlaneVertical_16(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_argb, uint16* dst_argb, + int x, int y, int dy, + int wpp, enum FilterMode filtering); + +// Simplify the filtering based on scale factors. +enum FilterMode ScaleFilterReduce(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering); + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div); +int FixedDiv_X86(int num, int div); +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_C(int num, int div); +int FixedDiv1_X86(int num, int div); +#ifdef HAS_FIXEDDIV_X86 +#define FixedDiv FixedDiv_X86 +#define FixedDiv1 FixedDiv1_X86 +#else +#define FixedDiv FixedDiv_C +#define FixedDiv1 FixedDiv1_C +#endif + +// Compute slope values for stepping. 
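// (From the definition above, FixedDiv returns a 16.16 fixed point quotient,
// e.g. FixedDiv(1280, 640) == 2 << 16 == 131072; the x/y origins and dx/dy
// steps produced below are expressed in the same 16.16 format.)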
+void ScaleSlope(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering, + int* x, int* y, int* dx, int* dy); + +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width); +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width); +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int, int); +void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int, int); +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx); +void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx); +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); +void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height); +void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint32* dst_ptr, int src_width, int src_height); +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_C(const 
uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int, int); +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); + +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, + int src_height); +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const 
uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +// Row functions. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +// ScaleRowDown2Box also used by planar functions +// NEON downscalers with interpolation. + +// Note - not static due to reuse in convert for 444 to 420. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT diff --git a/TMessagesProj/jni/libyuv/include/libyuv/version.h b/TMessagesProj/jni/libyuv/include/libyuv/version.h new file mode 100644 index 000000000..c6952040b --- /dev/null +++ b/TMessagesProj/jni/libyuv/include/libyuv/version.h @@ -0,0 
+1,16 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
+#define INCLUDE_LIBYUV_VERSION_H_
+
+#define LIBYUV_VERSION 1074
+
+#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/TMessagesProj/jni/libyuv/include/libyuv/video_common.h b/TMessagesProj/jni/libyuv/include/libyuv/video_common.h
new file mode 100644
index 000000000..91acc2ffc
--- /dev/null
+++ b/TMessagesProj/jni/libyuv/include/libyuv/video_common.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Common definitions for video, including fourcc and VideoFormat.
+
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+
+// Convert four characters to a FourCC code.
+// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#ifdef __cplusplus
+#define FOURCC(a, b, c, d) ( \
+    (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
+    (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#else
+#define FOURCC(a, b, c, d) ( \
+    ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
+    ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */
+#endif
+
+// Some pages discussing FourCC codes:
+// http://www.fourcc.org/yuv.php
+// http://v4l2spec.bytesex.org/spec/book1.htm
+// http://developer.apple.com/quicktime/icefloe/dispatch020.html
+// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+
+// FourCC codes grouped according to implementation efficiency.
+// Primary formats should convert in 1 efficient step.
+// Secondary formats are converted in 2 steps.
+// Auxiliary formats call primary converters.
+enum FourCC {
+  // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+  FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+  FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+  FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+  FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+  FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+  FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+  FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+  FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+  // 2 Secondary YUV formats: row biplanar.
+  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+
+  // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+ FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), + FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), + FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), + FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), + FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), + FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. + FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. + FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. + + // 4 Secondary RGB formats: 4 Bayer Patterns. + FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), + FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), + FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), + FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), + + // 1 Primary Compressed YUV format. + FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), + + // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. + FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), + FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), + FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), + FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. + FOURCC_J420 = FOURCC('J', '4', '2', '0'), + FOURCC_J400 = FOURCC('J', '4', '0', '0'), + + // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. + FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. + FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422. + FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444. + FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2. + FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac. + FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY. + FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac. + FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG. + FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac. + FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR. + FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW. + FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG. + FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB + FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB + FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO. + FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. + FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. + + // 1 Auxiliary compressed YUV format set aside for capturer. + FOURCC_H264 = FOURCC('H', '2', '6', '4'), + + // Match any fourcc. + FOURCC_ANY = -1, +}; + +enum FourCCBpp { + // Canonical fourcc codes used in our code. + FOURCC_BPP_I420 = 12, + FOURCC_BPP_I422 = 16, + FOURCC_BPP_I444 = 24, + FOURCC_BPP_I411 = 12, + FOURCC_BPP_I400 = 8, + FOURCC_BPP_NV21 = 12, + FOURCC_BPP_NV12 = 12, + FOURCC_BPP_YUY2 = 16, + FOURCC_BPP_UYVY = 16, + FOURCC_BPP_M420 = 12, + FOURCC_BPP_Q420 = 12, + FOURCC_BPP_ARGB = 32, + FOURCC_BPP_BGRA = 32, + FOURCC_BPP_ABGR = 32, + FOURCC_BPP_RGBA = 32, + FOURCC_BPP_24BG = 24, + FOURCC_BPP_RAW = 24, + FOURCC_BPP_RGBP = 16, + FOURCC_BPP_RGBO = 16, + FOURCC_BPP_R444 = 16, + FOURCC_BPP_RGGB = 8, + FOURCC_BPP_BGGR = 8, + FOURCC_BPP_GRBG = 8, + FOURCC_BPP_GBRG = 8, + FOURCC_BPP_YV12 = 12, + FOURCC_BPP_YV16 = 16, + FOURCC_BPP_YV24 = 24, + FOURCC_BPP_YU12 = 12, + FOURCC_BPP_J420 = 12, + FOURCC_BPP_J400 = 8, + FOURCC_BPP_MJPG = 0, // 0 means unknown. 
+  FOURCC_BPP_H264 = 0,
+  FOURCC_BPP_IYUV = 12,
+  FOURCC_BPP_YU16 = 16,
+  FOURCC_BPP_YU24 = 24,
+  FOURCC_BPP_YUYV = 16,
+  FOURCC_BPP_YUVS = 16,
+  FOURCC_BPP_HDYC = 16,
+  FOURCC_BPP_2VUY = 16,
+  FOURCC_BPP_JPEG = 1,
+  FOURCC_BPP_DMB1 = 1,
+  FOURCC_BPP_BA81 = 8,
+  FOURCC_BPP_RGB3 = 24,
+  FOURCC_BPP_BGR3 = 24,
+  FOURCC_BPP_CM32 = 32,
+  FOURCC_BPP_CM24 = 24,
+
+  // Match any fourcc.
+  FOURCC_BPP_ANY = 0, // 0 means unknown.
+};
+
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT
diff --git a/TMessagesProj/jni/libyuv/source/compare.cc b/TMessagesProj/jni/libyuv/source/compare.cc
new file mode 100644
index 000000000..dc715e019
--- /dev/null
+++ b/TMessagesProj/jni/libyuv/source/compare.cc
@@ -0,0 +1,325 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/compare.h"
+
+#include <float.h>
+#include <math.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
+
+// This module is for Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
+#define HAS_HASHDJB2_SSE41
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+
+#if _MSC_VER >= 1700
+#define HAS_HASHDJB2_AVX2
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+#endif
+
+#endif // HAS_HASHDJB2_SSE41
+
+// hash seed of 5381 recommended.
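HashDjb2 below folds the classic djb2 recurrence hash = hash * 33 + byte (seed 5381) over the buffer in 32 KB blocks, feeding each block's result into the next call as the seed. The SSE4.1/AVX2 kernels consume 16 bytes per iteration by using the algebraic expansion hash' = hash * 33^16 + sum of src[i] * 33^(15 - i), which is where the kHash16x33 and kHashMul constants defined later come from. A scalar sketch of that identity (names are illustrative):

#include <stdint.h>

/* Bytewise djb2 over one 16-byte group. */
static uint32_t Djb2Bytewise(uint32_t hash, const uint8_t* p) {
  for (int i = 0; i < 16; ++i) {
    hash = hash * 33u + p[i]; /* same as hash + (hash << 5) + p[i] */
  }
  return hash;
}

/* The same 16 bytes folded in one step, as the SIMD paths do. */
static uint32_t Djb2Grouped(uint32_t hash, const uint8_t* p) {
  uint32_t scale = 1; /* becomes 33^16 mod 2^32 = 0x92d9e201 (kHash16x33) */
  uint32_t sum = 0;
  uint32_t pow33 = 1; /* walks through 33^0 .. 33^15, the kHashMul entries */
  for (int i = 0; i < 16; ++i) {
    scale *= 33u;
  }
  for (int i = 15; i >= 0; --i) {
    sum += p[i] * pow33;
    pow33 *= 33u;
  }
  return hash * scale + sum; /* equals Djb2Bytewise(hash, p) */
}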
+LIBYUV_API +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { + const int kBlockSize = 1 << 15; // 32768; + int remainder; + uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; +#if defined(HAS_HASHDJB2_SSE41) + if (TestCpuFlag(kCpuHasSSE41)) { + HashDjb2_SSE = HashDjb2_SSE41; + } +#endif +#if defined(HAS_HASHDJB2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HashDjb2_SSE = HashDjb2_AVX2; + } +#endif + + while (count >= (uint64)(kBlockSize)) { + seed = HashDjb2_SSE(src, kBlockSize, seed); + src += kBlockSize; + count -= kBlockSize; + } + remainder = (int)(count) & ~15; + if (remainder) { + seed = HashDjb2_SSE(src, remainder, seed); + src += remainder; + count -= remainder; + } + remainder = (int)(count) & 15; + if (remainder) { + seed = HashDjb2_C(src, remainder, seed); + } + return seed; +} + +uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) +#define HAS_SUMSQUAREERROR_NEON +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); +#endif +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_SUMSQUAREERROR_SSE2 +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); +#endif +// Visual C 2012 required for AVX2. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700 +#define HAS_SUMSQUAREERROR_AVX2 +uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); +#endif + +// TODO(fbarchard): Refactor into row function. +LIBYUV_API +uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, + int count) { + // SumSquareError returns values 0 to 65535 for each squared difference. + // Up to 65536 of those can be summed and remain within a uint32. + // After each block of 65536 pixels, accumulate into a uint64. + const int kBlockSize = 65536; + int remainder = count & (kBlockSize - 1) & ~31; + uint64 sse = 0; + int i; + uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = + SumSquareError_C; +#if defined(HAS_SUMSQUAREERROR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SumSquareError = SumSquareError_NEON; + } +#endif +#if defined(HAS_SUMSQUAREERROR_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) { + // Note only used for multiples of 16 so count is not checked. + SumSquareError = SumSquareError_SSE2; + } +#endif +#if defined(HAS_SUMSQUAREERROR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + // Note only used for multiples of 32 so count is not checked. + SumSquareError = SumSquareError_AVX2; + } +#endif +#ifdef _OPENMP +#pragma omp parallel for reduction(+: sse) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + sse += SumSquareError(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + sse += SumSquareError(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & 31; + if (remainder) { + sse += SumSquareError_C(src_a, src_b, remainder); + } + return sse; +} + +LIBYUV_API +uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height) { + uint64 sse = 0; + int h; + // Coalesce rows. 
+ if (stride_a == width && + stride_b == width) { + width *= height; + height = 1; + stride_a = stride_b = 0; + } + for (h = 0; h < height; ++h) { + sse += ComputeSumSquareError(src_a, src_b, width); + src_a += stride_a; + src_b += stride_b; + } + return sse; +} + +LIBYUV_API +double SumSquareErrorToPsnr(uint64 sse, uint64 count) { + double psnr; + if (sse > 0) { + double mse = (double)(count) / (double)(sse); + psnr = 10.0 * log10(255.0 * 255.0 * mse); + } else { + psnr = kMaxPsnr; // Limit to prevent divide by 0 + } + + if (psnr > kMaxPsnr) + psnr = kMaxPsnr; + + return psnr; +} + +LIBYUV_API +double CalcFramePsnr(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height) { + const uint64 samples = width * height; + const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, + src_b, stride_b, + width, height); + return SumSquareErrorToPsnr(sse, samples); +} + +LIBYUV_API +double I420Psnr(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height) { + const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, + src_y_b, stride_y_b, + width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a, + src_u_b, stride_u_b, + width_uv, height_uv); + const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a, + src_v_b, stride_v_b, + width_uv, height_uv); + const uint64 samples = width * height + 2 * (width_uv * height_uv); + const uint64 sse = sse_y + sse_u + sse_v; + return SumSquareErrorToPsnr(sse, samples); +} + +static const int64 cc1 = 26634; // (64^2*(.01*255)^2 +static const int64 cc2 = 239708; // (64^2*(.03*255)^2 + +static double Ssim8x8_C(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b) { + int64 sum_a = 0; + int64 sum_b = 0; + int64 sum_sq_a = 0; + int64 sum_sq_b = 0; + int64 sum_axb = 0; + + int i; + for (i = 0; i < 8; ++i) { + int j; + for (j = 0; j < 8; ++j) { + sum_a += src_a[j]; + sum_b += src_b[j]; + sum_sq_a += src_a[j] * src_a[j]; + sum_sq_b += src_b[j] * src_b[j]; + sum_axb += src_a[j] * src_b[j]; + } + + src_a += stride_a; + src_b += stride_b; + } + + { + const int64 count = 64; + // scale the constants by number of pixels + const int64 c1 = (cc1 * count * count) >> 12; + const int64 c2 = (cc2 * count * count) >> 12; + + const int64 sum_a_x_sum_b = sum_a * sum_b; + + const int64 ssim_n = (2 * sum_a_x_sum_b + c1) * + (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); + + const int64 sum_a_sq = sum_a*sum_a; + const int64 sum_b_sq = sum_b*sum_b; + + const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) * + (count * sum_sq_a - sum_a_sq + + count * sum_sq_b - sum_b_sq + c2); + + if (ssim_d == 0.0) { + return DBL_MAX; + } + return ssim_n * 1.0 / ssim_d; + } +} + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. 
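For reference, each 8x8 window above is scored with the standard SSIM ratio

  SSIM = ((2 * mu_a * mu_b + C1) * (2 * cov_ab + C2)) /
         ((mu_a^2 + mu_b^2 + C1) * (var_a + var_b + C2))

which Ssim8x8_C forms directly from the raw integer sums sum_a, sum_b, sum_sq_a, sum_sq_b and sum_axb; the cc1/cc2 constants are C1 = (0.01 * 255)^2 and C2 = (0.03 * 255)^2 pre-scaled by 64^2 so the ratio never has to divide by the pixel count first. Similarly, SumSquareErrorToPsnr evaluates PSNR = 10 * log10(255^2 * N / SSE), clamped at kMaxPsnr; note that the local variable named mse there actually holds N / SSE, the reciprocal of the mean squared error.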
+LIBYUV_API +double CalcFrameSsim(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height) { + int samples = 0; + double ssim_total = 0; + double (*Ssim8x8)(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b) = Ssim8x8_C; + + // sample point start with each 4x4 location + int i; + for (i = 0; i < height - 8; i += 4) { + int j; + for (j = 0; j < width - 8; j += 4) { + ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b); + samples++; + } + + src_a += stride_a * 4; + src_b += stride_b * 4; + } + + ssim_total /= samples; + return ssim_total; +} + +LIBYUV_API +double I420Ssim(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height) { + const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a, + src_y_b, stride_y_b, width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, + src_u_b, stride_u_b, + width_uv, height_uv); + const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, + src_v_b, stride_v_b, + width_uv, height_uv); + return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/compare_common.cc b/TMessagesProj/jni/libyuv/source/compare_common.cc new file mode 100644 index 000000000..c546b5182 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/compare_common.cc @@ -0,0 +1,42 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) { + uint32 sse = 0u; + int i; + for (i = 0; i < count; ++i) { + int diff = src_a[i] - src_b[i]; + sse += (uint32)(diff * diff); + } + return sse; +} + +// hash seed of 5381 recommended. +// Internal C version of HashDjb2 with int sized count for efficiency. +uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { + uint32 hash = seed; + int i; + for (i = 0; i < count; ++i) { + hash += (hash << 5) + src[i]; + } + return hash; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/compare_neon.cc b/TMessagesProj/jni/libyuv/source/compare_neon.cc new file mode 100644 index 000000000..55052c0ee --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/compare_neon.cc @@ -0,0 +1,103 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + return sse; +} + +#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + "subs %2, %2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "bgt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + return sse; +} + +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/compare_posix.cc b/TMessagesProj/jni/libyuv/source/compare_posix.cc new file mode 100644 index 000000000..ac361190e --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/compare_posix.cc @@ -0,0 +1,158 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + uint32 sse; + asm volatile ( // NOLINT + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "movdqa " MEMACCESS(1) ",%%xmm2 \n" + "lea " MEMLEA(0x10, 1) ",%1 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); // NOLINT + return sse; +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + uint32 hash; + asm volatile ( // NOLINT + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "sub $0x10,%1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); // NOLINT + return hash; +} +#endif // defined(__x86_64__) || (defined(__i386__) && 
!defined(__pic__))) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/TMessagesProj/jni/libyuv/source/compare_win.cc b/TMessagesProj/jni/libyuv/source/compare_win.cc new file mode 100644 index 000000000..99831651f --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/compare_win.cc @@ -0,0 +1,232 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +__declspec(naked) __declspec(align(16)) +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + pxor xmm0, xmm0 + pxor xmm5, xmm5 + + align 4 + wloop: + movdqa xmm1, [eax] + lea eax, [eax + 16] + movdqa xmm2, [edx] + lea edx, [edx + 16] + sub ecx, 16 + movdqa xmm3, xmm1 // abs trick + psubusb xmm1, xmm2 + psubusb xmm2, xmm3 + por xmm1, xmm2 + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm0, xmm1 + paddd xmm0, xmm2 + jg wloop + + pshufd xmm1, xmm0, 0xee + paddd xmm0, xmm1 + pshufd xmm1, xmm0, 0x01 + paddd xmm0, xmm1 + movd eax, xmm0 + ret + } +} + +// Visual C 2012 required for AVX2. +#if _MSC_VER >= 1700 +// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. +#pragma warning(disable: 4752) +__declspec(naked) __declspec(align(16)) +uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + vpxor ymm0, ymm0, ymm0 // sum + vpxor ymm5, ymm5, ymm5 // constant 0 for unpck + sub edx, eax + + align 4 + wloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + edx] + lea eax, [eax + 32] + sub ecx, 32 + vpsubusb ymm3, ymm1, ymm2 // abs difference trick + vpsubusb ymm2, ymm2, ymm1 + vpor ymm1, ymm2, ymm3 + vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. + vpunpckhbw ymm1, ymm1, ymm5 + vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. + vpmaddwd ymm1, ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm2 + jg wloop + + vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpermq ymm1, ymm0, 0x02 // high + low lane. 
+ vpaddd ymm0, ymm0, ymm1 + vmovd eax, xmm0 + vzeroupper + ret + } +} +#endif // _MSC_VER >= 1700 + +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 +// 44: 66 0F 38 40 DD pmulld xmm3,xmm5 +// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 +// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 +// 83: 66 0F 38 40 CD pmulld xmm1,xmm5 +#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ + _asm _emit 0x40 _asm _emit reg + +__declspec(naked) __declspec(align(16)) +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + + pxor xmm7, xmm7 // constant 0 for unpck + movdqa xmm6, kHash16x33 + + align 4 + wloop: + movdqu xmm1, [eax] // src[0-15] + lea eax, [eax + 16] + pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16 + movdqa xmm5, kHashMul0 + movdqa xmm2, xmm1 + punpcklbw xmm2, xmm7 // src[0-7] + movdqa xmm3, xmm2 + punpcklwd xmm3, xmm7 // src[0-3] + pmulld(0xdd) // pmulld xmm3, xmm5 + movdqa xmm5, kHashMul1 + movdqa xmm4, xmm2 + punpckhwd xmm4, xmm7 // src[4-7] + pmulld(0xe5) // pmulld xmm4, xmm5 + movdqa xmm5, kHashMul2 + punpckhbw xmm1, xmm7 // src[8-15] + movdqa xmm2, xmm1 + punpcklwd xmm2, xmm7 // src[8-11] + pmulld(0xd5) // pmulld xmm2, xmm5 + movdqa xmm5, kHashMul3 + punpckhwd xmm1, xmm7 // src[12-15] + pmulld(0xcd) // pmulld xmm1, xmm5 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + sub ecx, 16 + paddd xmm1, xmm3 + + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} + +// Visual C 2012 required for AVX2. 
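The SumSquareError_SSE2/AVX2 kernels above have no signed byte difference available, so they build |a - b| from two saturating subtracts: psubusb clamps at zero, one of the two directions is therefore always zero, and OR-ing them leaves exactly the absolute difference, which pmaddwd then squares and pair-sums into 32-bit accumulators. A scalar statement of the identity:

#include <stdint.h>

/* max(a - b, 0): the scalar equivalent of psubusb on one byte lane. */
static uint8_t SubSat(uint8_t a, uint8_t b) {
  return (uint8_t)(a > b ? a - b : 0);
}

/* SubSat(a, b) | SubSat(b, a) == |a - b|, since one of the two terms is 0. */
static uint8_t AbsDiff(uint8_t a, uint8_t b) {
  return (uint8_t)(SubSat(a, b) | SubSat(b, a));
}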
+#if _MSC_VER >= 1700 +__declspec(naked) __declspec(align(16)) +uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + movdqa xmm6, kHash16x33 + + align 4 + wloop: + vpmovzxbd xmm3, dword ptr [eax] // src[0-3] + pmulld xmm0, xmm6 // hash *= 33 ^ 16 + vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7] + pmulld xmm3, kHashMul0 + vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11] + pmulld xmm4, kHashMul1 + vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15] + pmulld xmm2, kHashMul2 + lea eax, [eax + 16] + pmulld xmm1, kHashMul3 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + sub ecx, 16 + paddd xmm1, xmm3 + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} +#endif // _MSC_VER >= 1700 + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/convert.cc b/TMessagesProj/jni/libyuv/source/convert.cc new file mode 100644 index 000000000..c31ecf263 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/convert.cc @@ -0,0 +1,1543 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// Any I4xx To I420 format with mirroring. +static int I4xxToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_y_width, int src_y_height, + int src_uv_width, int src_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); + const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); + if (src_y_width == 0 || src_y_height == 0 || + src_uv_width == 0 || src_uv_height == 0) { + return -1; + } + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, + dst_y, dst_stride_y, dst_y_width, dst_y_height, + kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, + dst_u, dst_stride_u, dst_uv_width, dst_uv_height, + kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, + dst_v, dst_stride_v, dst_uv_width, dst_uv_height, + kFilterBilinear); + return 0; +} + +// Copy I420 with optional flipping +// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure +// is does row coalescing. 
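The SUBSAMPLE macro above is how the converters below size their chroma planes: it halves (or quarters) a dimension with rounding away from zero, so odd sizes round up and negative (inverted-image) sizes keep their sign. A few worked values, taken directly from the definition:

/* SUBSAMPLE(v, a, s) = (v < 0) ? -((-v + a) >> s) : ((v + a) >> s)
 *
 *   SUBSAMPLE(640, 1, 1) == 320   half width, even input
 *   SUBSAMPLE(7, 1, 1)   ==   4   half width, odd input rounds up
 *   SUBSAMPLE(-7, 1, 1)  ==  -4   negative size keeps its sign
 *   SUBSAMPLE(7, 3, 2)   ==   2   quarter width (the 4:1:1 chroma case)
 */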
+LIBYUV_API +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// 422 chroma is 1/2 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int src_uv_width = SUBSAMPLE(width, 1, 1); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); +} + +// 444 chroma is 1x width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I444ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + width, height); +} + +// 411 chroma is 1/4 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I411ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int src_uv_width = SUBSAMPLE(width, 3, 2); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); +} + +// I400 is greyscale typically used in MJPG +LIBYUV_API +int I400ToI420(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128); + SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128); + return 0; +} + +static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, + uint8* dst, int dst_stride, + int width, int height) { + int y; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src, 16) && + IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Copy plane + for (y = 0; y < height - 1; y += 2) { + CopyRow(src, dst, width); + CopyRow(src + src_stride_0, dst + dst_stride, width); + src += src_stride_0 + src_stride_1; + dst += dst_stride * 2; + } + if (height & 1) { + CopyRow(src, dst, width); + } +} + +// Support converting from FOURCC_M420 +// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for +// easy conversion to I420. +// M420 format description: +// M420 is row biplanar 420: 2 rows of Y and 1 row of UV. +// Chroma is half width / half height. (420) +// src_stride_m420 is row planar. Normally this will be the width in pixels. +// The UV plane is half width, but 2 values, so src_stride_m420 applies to +// this as well as the two Y planes. +static int X420ToI420(const uint8* src_y, + int src_stride_y0, int src_stride_y1, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = + SplitUVRow_C; + if (!src_y || !src_uv || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. + if (src_stride_y0 == width && + src_stride_y1 == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y0 = src_stride_y1 = dst_stride_y = 0; + } + // Coalesce rows. 
+ if (src_stride_uv == halfwidth * 2 && + dst_stride_u == halfwidth && + dst_stride_v == halfwidth) { + halfwidth *= halfheight; + halfheight = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_Unaligned_SSE2; + if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) && + IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && + IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) { + SplitUVRow = SplitUVRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2; + if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) && + IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && + IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { + SplitUVRow = SplitUVRow_MIPS_DSPR2; + } + } + } +#endif + + if (dst_y) { + if (src_stride_y0 == src_stride_y1) { + CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height); + } else { + CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, + width, height); + } + } + + for (y = 0; y < halfheight; ++y) { + // Copy a row of UV. + SplitUVRow(src_uv, dst_u, dst_v, halfwidth); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } + return 0; +} + +// Convert NV12 to I420. +LIBYUV_API +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, + src_uv, src_stride_uv, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert NV21 to I420. Same as NV12 but u and v pointers swapped. +LIBYUV_API +int NV21ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, + src_vu, src_stride_vu, + dst_y, dst_stride_y, + dst_v, dst_stride_v, + dst_u, dst_stride_u, + width, height); +} + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert Q420 to I420. 
+// Format is rows of YY/YUYV +LIBYUV_API +int Q420ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + int halfheight; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) = YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) = + YUY2ToYRow_C; + if (!src_y || !src_yuy2 || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // CopyRow for rows of just Y in Q420 copied to Y plane of I420. +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_X86) + if (IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUV422Row = YUY2ToUV422Row_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width >= 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + CopyRow(src_y, dst_y, width); + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + } + return 0; +} + +// Convert YUY2 to I420. 
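YUY2ToI420 below leans on row helpers that read the packed YUY2 layout, two pixels per 32 bits ordered Y0 U0 Y1 V0, and for 4:2:0 output average the U/V samples of two adjacent rows (which is why YUY2ToUVRow also takes the stride of the second row). A scalar reference for one row pair, under those layout assumptions (the helper name is illustrative):

#include <stdint.h>

/* Convert two YUY2 rows (width * 2 bytes each) into two I420 luma rows plus
 * one half-width U row and V row. Assumes even width and Y0 U0 Y1 V0 order. */
static void Yuy2RowPairToI420(const uint8_t* row0, const uint8_t* row1,
                              uint8_t* dst_y0, uint8_t* dst_y1,
                              uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    dst_y0[x] = row0[x * 2 + 0];
    dst_y0[x + 1] = row0[x * 2 + 2];
    dst_y1[x] = row1[x * 2 + 0];
    dst_y1[x + 1] = row1[x * 2 + 2];
    /* 4:2:0 chroma: one U/V pair per 2x2 block, averaged across the rows. */
    dst_u[x >> 1] = (uint8_t)((row0[x * 2 + 1] + row1[x * 2 + 1] + 1) >> 1);
    dst_v[x >> 1] = (uint8_t)((row0[x * 2 + 3] + row1[x * 2 + 3] + 1) >> 1);
  }
}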
+LIBYUV_API +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, + uint8* dst_y, int pix) = YUY2ToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUVRow = YUY2ToUVRow_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUVRow = YUY2ToUVRow_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width >= 16) { + YUY2ToUVRow = YUY2ToUVRow_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUVRow = YUY2ToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + } + return 0; +} + +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C; + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix) = UYVYToYRow_C; + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + UYVYToUVRow = UYVYToUVRow_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2; + UYVYToYRow = UYVYToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { + UYVYToUVRow = UYVYToUVRow_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + UYVYToYRow = UYVYToYRow_SSE2; + } + } + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + UYVYToUVRow = UYVYToUVRow_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUVRow = UYVYToUVRow_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (width >= 16) { + UYVYToUVRow = UYVYToUVRow_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUVRow = UYVYToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); + src_uyvy += src_stride_uyvy * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + } + return 0; +} + +// Convert ARGB to I420. +LIBYUV_API +int ARGBToI420(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
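+  // For example, height = -480 makes the conversion read the 480 source rows
+  // bottom-to-top: src_argb is moved to the last row and the stride is
+  // negated, so each step of the loop below walks upward through the source.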
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + } + return 0; +} + +// Convert BGRA to I420. +LIBYUV_API +int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C; + void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) = + BGRAToYRow_C; + if (!src_bgra || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } +#if defined(HAS_BGRATOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + BGRAToUVRow = BGRAToUVRow_Any_SSSE3; + BGRAToYRow = BGRAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3; + BGRAToYRow = BGRAToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) { + BGRAToUVRow = BGRAToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + BGRAToYRow = BGRAToYRow_SSSE3; + } + } + } + } +#elif defined(HAS_BGRATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + BGRAToYRow = BGRAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + BGRAToYRow = BGRAToYRow_NEON; + } + } +#endif +#if defined(HAS_BGRATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + BGRAToUVRow = BGRAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width); + src_bgra += src_stride_bgra * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + } + return 0; +} + +// Convert ABGR to I420. +LIBYUV_API +int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) = + ABGRToYRow_C; + if (!src_abgr || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3; + ABGRToYRow = ABGRToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ABGRToYRow = ABGRToYRow_SSSE3; + } + } + } + } +#elif defined(HAS_ABGRTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ABGRToYRow = ABGRToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + } + return 0; +} + +// Convert RGBA to I420. +LIBYUV_API +int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C; + void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) = + RGBAToYRow_C; + if (!src_rgba || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; + } +#if defined(HAS_RGBATOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + RGBAToUVRow = RGBAToUVRow_Any_SSSE3; + RGBAToYRow = RGBAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3; + RGBAToYRow = RGBAToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) { + RGBAToUVRow = RGBAToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + RGBAToYRow = RGBAToYRow_SSSE3; + } + } + } + } +#elif defined(HAS_RGBATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RGBAToYRow = RGBAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGBAToYRow = RGBAToYRow_NEON; + } + } +#endif +#if defined(HAS_RGBATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + RGBAToUVRow = RGBAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width); + src_rgba += src_stride_rgba * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + } + return 0; +} + +// Convert RGB24 to I420. +LIBYUV_API +int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RGB24TOYROW_NEON) + void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C; + void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) = + RGB24ToYRow_C; +#else + void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RGB24ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + +#if defined(HAS_RGB24TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RGB24ToYRow = RGB24ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToYRow = RGB24ToYRow_NEON; + } + } +#endif +#if defined(HAS_RGB24TOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + RGB24ToUVRow = RGB24ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVRow = RGB24ToUVRow_NEON; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 + + { +#if !defined(HAS_RGB24TOYROW_NEON) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RGB24TOYROW_NEON) + RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RGB24TOYROW_NEON) + RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RGB24TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert RAW to I420. +LIBYUV_API +int RAWToI420(const uint8* src_raw, int src_stride_raw, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RAWTOYROW_NEON) + void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C; + void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) = + RAWToYRow_C; +#else + void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RAWToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_raw || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +#if defined(HAS_RAWTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RAWToYRow = RAWToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToYRow = RAWToYRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + RAWToUVRow = RAWToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToUVRow = RAWToUVRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 + + { + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + for (y = 0; y < height - 1; y += 2) { + #if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); + #else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + #endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + #if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, 0, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + #else + RAWToARGBRow(src_raw, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + #endif + } + #if !defined(HAS_RAWTOYROW_NEON) + free_aligned_buffer_64(row); + #endif + } + return 0; +} + +// Convert RGB565 to I420. +LIBYUV_API +int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RGB565TOYROW_NEON) + void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C; + void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) = + RGB565ToYRow_C; +#else + void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + +#if defined(HAS_RGB565TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RGB565ToYRow = RGB565ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToYRow = RGB565ToYRow_NEON; + } + if (width >= 16) { + RGB565ToUVRow = RGB565ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVRow = RGB565ToUVRow_NEON; + } + } + } +#else // HAS_RGB565TOYROW_NEON + +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 +#endif // HAS_RGB565TOYROW_NEON + + { +#if !defined(HAS_RGB565TOYROW_NEON) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RGB565TOYROW_NEON) + RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); + RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RGB565TOYROW_NEON) + RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RGB565TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert ARGB1555 to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_ARGB1555TOYROW_NEON) + void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C; + void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) = + ARGB1555ToYRow_C; +#else + void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + ARGB1555ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_argb1555 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + +#if defined(HAS_ARGB1555TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToYRow = ARGB1555ToYRow_NEON; + } + if (width >= 16) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; + } + } + } +#else // HAS_ARGB1555TOYROW_NEON + +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 +#endif // HAS_ARGB1555TOYROW_NEON + + { +#if !defined(HAS_ARGB1555TOYROW_NEON) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_ARGB1555TOYROW_NEON) + ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, + width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb1555 += src_stride_argb1555 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_ARGB1555TOYROW_NEON) + ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_ARGB1555TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +// Convert ARGB4444 to I420. 
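+// ARGB4444 carries only 4 bits per channel. Where no dedicated NEON row
+// function is available, each pair of rows is first widened to 8-bit ARGB in
+// a temporary buffer and then run through the ARGB row converters below.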
+LIBYUV_API +int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_ARGB4444TOYROW_NEON) + void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C; + void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) = + ARGB4444ToYRow_C; +#else + void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + ARGB4444ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_argb4444 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + +#if defined(HAS_ARGB4444TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToYRow = ARGB4444ToYRow_NEON; + } + if (width >= 16) { + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; + } + } + } +#else // HAS_ARGB4444TOYROW_NEON + +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 +#endif // HAS_ARGB4444TOYROW_NEON + + { +#if !defined(HAS_ARGB4444TOYROW_NEON) + // Allocate 2 rows of ARGB. 
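+    // kRowSize is width * 4 bytes (one ARGB pixel per source pixel) rounded
+    // up to a multiple of 16 so the second row in the buffer stays 16-byte
+    // aligned; two rows are needed because the loop converts a pair of
+    // source rows per iteration.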
+ const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, + width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb4444 += src_stride_argb4444 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_ARGB4444TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/convert_argb.cc b/TMessagesProj/jni/libyuv/source/convert_argb.cc new file mode 100644 index 000000000..ac0bc3d15 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/convert_argb.cc @@ -0,0 +1,938 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_argb.h" + +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB with optional flipping +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width * 4, height); + return 0; +} + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I444ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I444ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
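+  // When every stride equals its row width the planes are contiguous in
+  // memory, so the image can be treated as one long row (width *= height,
+  // height = 1) and the loop below runs only once over the whole buffer.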
+ if (src_stride_y == width && + src_stride_u == width && + src_stride_v == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I444ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I411 to ARGB. 
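+// In I411 the chroma planes are subsampled 4:1 horizontally and not at all
+// vertically, so every U/V sample covers four luma pixels on the same row
+// (hence the src_stride_u * 4 == width check in the row-coalescing test).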
+LIBYUV_API +int I411ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I411ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I411ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 4 == width && + src_stride_v * 4 == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I411TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I411ToARGBRow = I411ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I411ToARGBRow = I411ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_I411TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I411ToARGBRow = I411ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I411ToARGBRow = I411ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I411ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I400 to ARGB. +LIBYUV_API +int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*YToARGBRow)(const uint8* y_buf, + uint8* rgb_buf, + int width) = YToARGBRow_C; + if (!src_y || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_YTOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + YToARGBRow = YToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + YToARGBRow = YToARGBRow_SSE2; + } + } +#elif defined(HAS_YTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YToARGBRow = YToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + YToARGBRow = YToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + YToARGBRow(src_y, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + } + return 0; +} + +// Convert I400 to ARGB. +LIBYUV_API +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = + I400ToARGBRow_C; + if (!src_y || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. 
+ if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_I400TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + I400ToARGBRow = I400ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I400ToARGBRow = I400ToARGBRow_SSE2; + } + } + } +#elif defined(HAS_I400TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I400ToARGBRow = I400ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + I400ToARGBRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Shuffle table for converting BGRA to ARGB. +static uvec8 kShuffleMaskBGRAToARGB = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u +}; + +// Shuffle table for converting ABGR to ARGB. +static uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u +}; + +// Shuffle table for converting RGBA to ARGB. +static uvec8 kShuffleMaskRGBAToARGB = { + 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u +}; + +// Convert BGRA to ARGB. +LIBYUV_API +int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, + dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskBGRAToARGB), + width, height); +} + +// Convert ARGB to BGRA (same as BGRAToARGB). +LIBYUV_API +int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, + dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskBGRAToARGB), + width, height); +} + +// Convert ABGR to ARGB. +LIBYUV_API +int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, + dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskABGRToARGB), + width, height); +} + +// Convert ARGB to ABGR to (same as ABGRToARGB). +LIBYUV_API +int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, + dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskABGRToARGB), + width, height); +} + +// Convert RGBA to ARGB. +LIBYUV_API +int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return ARGBShuffle(src_rgba, src_stride_rgba, + dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskRGBAToARGB), + width, height); +} + +// Convert RGB24 to ARGB. +LIBYUV_API +int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RGB24ToARGBRow_C; + if (!src_rgb24 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + // Coalesce rows. 
+ if (src_stride_rgb24 == width * 3 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgb24 = dst_stride_argb = 0; + } +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#elif defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RGB24ToARGBRow(src_rgb24, dst_argb, width); + src_rgb24 += src_stride_rgb24; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RAW to ARGB. +LIBYUV_API +int RAWToARGB(const uint8* src_raw, int src_stride_raw, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RAWToARGBRow_C; + if (!src_raw || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // Coalesce rows. + if (src_stride_raw == width * 3 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_raw = dst_stride_argb = 0; + } +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#elif defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RAWToARGBRow(src_raw, dst_argb, width); + src_raw += src_stride_raw; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RGB565 to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) = + RGB565ToARGBRow_C; + if (!src_rgb565 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + // Coalesce rows. 
+ if (src_stride_rgb565 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgb565 = dst_stride_argb = 0; + } +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#elif defined(HAS_RGB565TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RGB565ToARGBRow(src_rgb565, dst_argb, width); + src_rgb565 += src_stride_rgb565; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB1555 to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, + int pix) = ARGB1555ToARGBRow_C; + if (!src_argb1555 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + // Coalesce rows. + if (src_stride_argb1555 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb1555 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#elif defined(HAS_ARGB1555TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB1555ToARGBRow(src_argb1555, dst_argb, width); + src_argb1555 += src_stride_argb1555; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB4444 to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, + int pix) = ARGB4444ToARGBRow_C; + if (!src_argb4444 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + // Coalesce rows. 
+ if (src_stride_argb4444 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb4444 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#elif defined(HAS_ARGB4444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB4444ToARGBRow(src_argb4444, dst_argb, width); + src_argb4444 += src_stride_argb4444; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToARGBRow(src_y, src_uv, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV21ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV21ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV21TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV21ToARGBRow = NV21ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_NV21TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV21ToARGBRow = NV21ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToARGBRow(src_y, src_uv, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert M420 to ARGB. +LIBYUV_API +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; + if (!src_m420 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2, + dst_argb + dst_stride_argb, width); + dst_argb += dst_stride_argb * 2; + src_m420 += src_stride_m420 * 3; + } + if (height & 1) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + } + return 0; +} + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) = + YUY2ToARGBRow_C; + if (!src_yuy2 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_argb = 0; + } +#if defined(HAS_YUY2TOARGBROW_SSSE3) + // Posix is 16, Windows is 8. 
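+  // i.e. the SSSE3 row function appears to handle a different number of
+  // pixels per pass in the POSIX and Windows builds, so width >= 16 is used
+  // here as the bound that is safe for both.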
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_YUY2TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + YUY2ToARGBRow(src_yuy2, dst_argb, width); + src_yuy2 += src_stride_yuy2; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) = + UYVYToARGBRow_C; + if (!src_uyvy || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. + if (src_stride_uyvy == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_argb = 0; + } +#if defined(HAS_UYVYTOARGBROW_SSSE3) + // Posix is 16, Windows is 8. + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + UYVYToARGBRow = UYVYToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_UYVYTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + UYVYToARGBRow = UYVYToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + UYVYToARGBRow(src_uyvy, dst_argb, width); + src_uyvy += src_stride_uyvy; + dst_argb += dst_stride_argb; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/convert_from.cc b/TMessagesProj/jni/libyuv/source/convert_from.cc new file mode 100644 index 000000000..c1a2f62f0 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/convert_from.cc @@ -0,0 +1,1210 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_from.h" + +#include "libyuv/basic_types.h" +#include "libyuv/convert.h" // For I420Copy +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/video_common.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? 
v : -v; +} + +// I420 To any I4xx YUV format with mirroring. +static int I420ToI4xx(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_y_width, int src_y_height, + int dst_uv_width, int dst_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); + const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); + if (src_y_width == 0 || src_y_height == 0 || + dst_uv_width <= 0 || dst_uv_height <= 0) { + return -1; + } + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, + dst_y, dst_stride_y, dst_y_width, dst_y_height, + kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, + dst_u, dst_stride_u, dst_uv_width, dst_uv_height, + kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, + dst_v, dst_stride_v, dst_uv_width, dst_uv_height, + kFilterBilinear); + return 0; +} + +// 420 chroma is 1/2 width, 1/2 height +// 422 chroma is 1/2 width, 1x height +LIBYUV_API +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = (Abs(width) + 1) >> 1; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// 420 chroma is 1/2 width, 1/2 height +// 444 chroma is 1x width, 1x height +LIBYUV_API +int I420ToI444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = Abs(width); + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// 420 chroma is 1/2 width, 1/2 height +// 411 chroma is 1/4 width, 1x height +LIBYUV_API +int I420ToI411(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = (Abs(width) + 3) >> 2; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// Copy to I400. Source can be I420,422,444,400,NV12,NV21 +LIBYUV_API +int I400Copy(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +LIBYUV_API +int I422ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#elif defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2; + } + return 0; +} + +LIBYUV_API +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#elif defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + I422ToYUY2Row(src_y + src_stride_y, src_u, src_v, + dst_yuy2 + dst_stride_yuy2, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2 * 2; + } + if (height & 1) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + } + return 0; +} + +LIBYUV_API +int I422ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height) { + int y; + void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; + } +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#elif defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_uyvy += dst_stride_uyvy; + } + return 0; +} + +LIBYUV_API +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height) { + int y; + void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#elif defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + I422ToUYVYRow(src_y + src_stride_y, src_u, src_v, + dst_uyvy + dst_stride_uyvy, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_uyvy += dst_stride_uyvy * 2; + } + if (height & 1) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + } + return 0; +} + +LIBYUV_API +int I420ToNV12(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height) { + int y; + void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) = MergeUVRow_C; + // Coalesce rows. + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv; + dst_stride_y = -dst_stride_y; + dst_stride_uv = -dst_stride_uv; + } + if (src_stride_y == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } + // Coalesce rows. + if (src_stride_u == halfwidth && + src_stride_v == halfwidth && + dst_stride_uv == halfwidth * 2) { + halfwidth *= halfheight; + halfheight = 1; + src_stride_u = src_stride_v = dst_stride_uv = 0; + } +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_Unaligned_SSE2; + if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) && + IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && + IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif + + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + for (y = 0; y < halfheight; ++y) { + // Merge a row of U and V into a row of UV. 
+ MergeUVRow_(src_u, src_v, dst_uv, halfwidth); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += dst_stride_uv; + } + return 0; +} + +LIBYUV_API +int I420ToNV21(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height) { + return I420ToNV12(src_y, src_stride_y, + src_v, src_stride_v, + src_u, src_stride_u, + dst_y, dst_stride_y, + dst_vu, dst_stride_vu, + width, height); +} + +// Convert I420 to ARGB. +LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to BGRA. +LIBYUV_API +int I420ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height) { + int y; + void (*I422ToBGRARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToBGRARow_C; + if (!src_y || !src_u || !src_v || !dst_bgra || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image.
+ if (height < 0) { + height = -height; + dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; + dst_stride_bgra = -dst_stride_bgra; + } +#if defined(HAS_I422TOBGRAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { + I422ToBGRARow = I422ToBGRARow_SSSE3; + } + } + } +#elif defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToBGRARow = I422ToBGRARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_NEON; + } + } +#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) { + I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); + dst_bgra += dst_stride_bgra; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ABGR. +LIBYUV_API +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + int y; + void (*I422ToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToABGRRow_C; + if (!src_y || !src_u || !src_v || !dst_abgr || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; + dst_stride_abgr = -dst_stride_abgr; + } +#if defined(HAS_I422TOABGRROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToABGRRow = I422ToABGRRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { + I422ToABGRRow = I422ToABGRRow_SSSE3; + } + } + } +#elif defined(HAS_I422TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToABGRRow = I422ToABGRRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); + dst_abgr += dst_stride_abgr; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGBA. +LIBYUV_API +int I420ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + int y; + void (*I422ToRGBARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } + } +#elif defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB24. +LIBYUV_API +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { + int y; + void (*I422ToRGB24Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGB24Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb24 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } + } +#elif defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RAW. +LIBYUV_API +int I420ToRAW(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + int y; + void (*I422ToRAWRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRAWRow_C; + if (!src_y || !src_u || !src_v || !dst_raw || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_raw = dst_raw + (height - 1) * dst_stride_raw; + dst_stride_raw = -dst_stride_raw; + } +#if defined(HAS_I422TORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRAWRow = I422ToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRAWRow = I422ToRAWRow_SSSE3; + } + } +#elif defined(HAS_I422TORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRAWRow = I422ToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRAWRow = I422ToRAWRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRAWRow(src_y, src_u, src_v, dst_raw, width); + dst_raw += dst_stride_raw; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB1555. +LIBYUV_API +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height) { + int y; + void (*I422ToARGB1555Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGB1555Row_C; + if (!src_y || !src_u || !src_v || !dst_argb1555 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; + dst_stride_argb1555 = -dst_stride_argb1555; + } +#if defined(HAS_I422TOARGB1555ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + } + } +#elif defined(HAS_I422TOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width); + dst_argb1555 += dst_stride_argb1555; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + + +// Convert I420 to ARGB4444. +LIBYUV_API +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height) { + int y; + void (*I422ToARGB4444Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGB4444Row_C; + if (!src_y || !src_u || !src_v || !dst_argb4444 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444; + dst_stride_argb4444 = -dst_stride_argb4444; + } +#if defined(HAS_I422TOARGB4444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; + } + } +#elif defined(HAS_I422TOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width); + dst_argb4444 += dst_stride_argb4444; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB565. +LIBYUV_API +int I420ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*I422ToRGB565Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_I422TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_SSSE3; + } + } +#elif defined(HAS_I422TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRGB565Row = I422ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to specified format +LIBYUV_API +int ConvertFromI420(const uint8* y, int y_stride, + const uint8* u, int u_stride, + const uint8* v, int v_stride, + uint8* dst_sample, int dst_sample_stride, + int width, int height, + uint32 fourcc) { + uint32 format = CanonicalFourCC(fourcc); + int r = 0; + if (!y || !u|| !v || !dst_sample || + width <= 0 || height == 0) { + return -1; + } + switch (format) { + // Single plane formats + case FOURCC_YUY2: + r = I420ToYUY2(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_UYVY: + r = I420ToUYVY(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_RGBP: + r = I420ToRGB565(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_RGBO: + r = I420ToARGB1555(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_R444: + r = I420ToARGB4444(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? 
dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_24BG: + r = I420ToRGB24(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_RAW: + r = I420ToRAW(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_ARGB: + r = I420ToARGB(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_BGRA: + r = I420ToBGRA(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_ABGR: + r = I420ToABGR(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_RGBA: + r = I420ToRGBA(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_BGGR: + r = I420ToBayerBGGR(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_GBRG: + r = I420ToBayerGBRG(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_GRBG: + r = I420ToBayerGRBG(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_RGGB: + r = I420ToBayerRGGB(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_I400: + r = I400Copy(y, y_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_NV12: { + uint8* dst_uv = dst_sample + width * height; + r = I420ToNV12(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + dst_uv, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + } + case FOURCC_NV21: { + uint8* dst_vu = dst_sample + width * height; + r = I420ToNV21(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + dst_vu, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + } + // TODO(fbarchard): Add M420 and Q420. 
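// --- Editor's annotation (illustrative sketch, not part of the upstream libyuv sources) ---
// ConvertFromI420 dispatches on the canonical FOURCC. As a usage sketch, a caller that
// wants an Android-camera-style NV21 image in one contiguous buffer of at least
// width * height * 3 / 2 bytes (even dimensions assumed) could call:
//   ConvertFromI420(y, y_stride, u, u_stride, v, v_stride,
//                   dst, 0, width, height, FOURCC_NV21);
// A dst_sample_stride of 0 selects the per-format default strides used in these cases;
// `dst` is a hypothetical caller-owned byte buffer.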
+ // Triplanar formats + // TODO(fbarchard): halfstride instead of halfwidth + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + int halfwidth = (width + 1) / 2; + int halfheight = (height + 1) / 2; + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV12) { + dst_v = dst_sample + width * height; + dst_u = dst_v + halfwidth * halfheight; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + halfwidth * halfheight; + } + r = I420Copy(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + int halfwidth = (width + 1) / 2; + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV16) { + dst_v = dst_sample + width * height; + dst_u = dst_v + halfwidth * height; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + halfwidth * height; + } + r = I420ToI422(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV24) { + dst_v = dst_sample + width * height; + dst_u = dst_v + width * height; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + width * height; + } + r = I420ToI444(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, width, + dst_v, width, + width, height); + break; + } + case FOURCC_I411: { + int quarterwidth = (width + 3) / 4; + uint8* dst_u = dst_sample + width * height; + uint8* dst_v = dst_u + quarterwidth * height; + r = I420ToI411(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, quarterwidth, + dst_v, quarterwidth, + width, height); + break; + } + + // Formats not supported - MJPG, biplanar, some rgb formats. + default: + return -1; // unknown fourcc - return failure code. + } + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/convert_from_argb.cc b/TMessagesProj/jni/libyuv/source/convert_from_argb.cc new file mode 100644 index 000000000..de461ddb0 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/convert_from_argb.cc @@ -0,0 +1,1133 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/convert_from_argb.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// ARGB little endian (bgra in memory) to I444 +LIBYUV_API +int ARGBToI444(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV444Row_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u == width && + dst_stride_v == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUV444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } + } + } +#elif defined(HAS_ARGBTOUV444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToUV444Row = ARGBToUV444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } + +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV444Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// ARGB little endian (bgra in memory) to I422 +LIBYUV_API +int ARGBToI422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV422Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUV422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUV422Row = ARGBToUV422Row_SSSE3; + } + } + } +#elif defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } +#endif + +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV422Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// ARGB little endian (bgra in memory) to I411 +LIBYUV_API +int ARGBToI411(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV411Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u * 4 == width && + dst_stride_v * 4 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUV411ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 32) { + ARGBToUV411Row = ARGBToUV411Row_Any_NEON; + if (IS_ALIGNED(width, 32)) { + ARGBToUV411Row = ARGBToUV411Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV411Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +LIBYUV_API +int ARGBToNV12(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) = MergeUVRow_C; + if (!src_argb || + !dst_y || !dst_uv || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } + } +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); + uint8* row_v = row_u + ((halfwidth + 15) & ~15); + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Same as NV12 but U and V swapped. +LIBYUV_API +int ARGBToNV21(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) = MergeUVRow_C; + if (!src_argb || + !dst_y || !dst_uv || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } + } +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); + uint8* row_v = row_u + ((halfwidth + 15) & ~15); + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Convert ARGB to YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV422Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C; + + if (!src_argb || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yuy2 = 0; + } +#if defined(HAS_ARGBTOUV422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUV422Row = ARGBToUV422Row_SSSE3; + } + } + } +#elif defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#elif defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + + for (y = 0; y < height; ++y) { + ARGBToUV422Row(src_argb, row_u, row_v, width); + ARGBToYRow(src_argb, row_y, width); + I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width); + src_argb += src_stride_argb; + dst_yuy2 += dst_stride_yuy2; + } + + free_aligned_buffer_64(row_y); + } + return 0; +} + +// Convert ARGB to UYVY. +LIBYUV_API +int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height) { + int y; + void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV422Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C; + + if (!src_argb || !dst_uyvy || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_uyvy = 0; + } +#if defined(HAS_ARGBTOUV422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUV422Row = ARGBToUV422Row_SSSE3; + } + } + } +#elif defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#elif defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif + + { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + + for (y = 0; y < height; ++y) { + ARGBToUV422Row(src_argb, row_u, row_v, width); + ARGBToYRow(src_argb, row_y, width); + I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width); + src_argb += src_stride_argb; + dst_uyvy += dst_stride_uyvy; + } + + free_aligned_buffer_64(row_y); + } + return 0; +} + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || !dst_y || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + } + return 0; +} + +// Shuffle table for converting ARGB to RGBA. +static uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u +}; + +// Convert ARGB to RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + return ARGBShuffle(src_argb, src_stride_argb, + dst_rgba, dst_stride_rgba, + (const uint8*)(&kShuffleMaskARGBToRGBA), + width, height); +} + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { + int y; + void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB24Row_C; + if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb24 = 0; + } +#if defined(HAS_ARGBTORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; + } + } +#elif defined(HAS_ARGBTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB24Row = ARGBToRGB24Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB24Row(src_argb, dst_rgb24, width); + src_argb += src_stride_argb; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + int y; + void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRAWRow_C; + if (!src_argb || !dst_raw || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_raw == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_raw = 0; + } +#if defined(HAS_ARGBTORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_SSSE3; + } + } +#elif defined(HAS_ARGBTORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToRAWRow = ARGBToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRAWRow = ARGBToRAWRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRAWRow(src_argb, dst_raw, width); + src_argb += src_stride_argb; + dst_raw += dst_stride_raw; + } + return 0; +} + +// Convert ARGB To RGB565. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB565Row_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_rgb565 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb565 = 0; + } +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } + } +#elif defined(HAS_ARGBTORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB565Row(src_argb, dst_rgb565, width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height) { + int y; + void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB1555Row_C; + if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb1555 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb1555 = 0; + } +#if defined(HAS_ARGBTOARGB1555ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; + } + } +#elif defined(HAS_ARGBTOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB1555Row(src_argb, dst_argb1555, width); + src_argb += src_stride_argb; + dst_argb1555 += dst_stride_argb1555; + } + return 0; +} + +// Convert ARGB To ARGB4444. 
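// --- Editor's annotation (illustrative only, not part of the upstream libyuv sources) ---
// ARGB4444 keeps the top 4 bits of each channel, packing A into bits 12-15, R into 8-11,
// G into 4-7 and B into 0-3. For example, an opaque pixel with ARGB value 0xFF203040
// (A=0xFF, R=0x20, G=0x30, B=0x40) becomes the 16-bit value 0xF234.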
+LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height) { + int y; + void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB4444Row_C; + if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb4444 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb4444 = 0; + } +#if defined(HAS_ARGBTOARGB4444ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; + } + } +#elif defined(HAS_ARGBTOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB4444Row(src_argb, dst_argb4444, width); + src_argb += src_stride_argb; + dst_argb4444 += dst_stride_argb4444; + } + return 0; +} + +// Convert ARGB to J420. (JPeg full range I420). +LIBYUV_API +int ARGBToJ420(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = + ARGBToYJRow_C; + if (!src_argb || + !dst_yj || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
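ARGBToJ420 differs from ARGBToI420 only in the row kernels it selects: the *J* variants produce full-range (JPEG) YUV instead of the 16..235 studio range. Roughly, the two luma formulas look like this (a sketch of the fixed-point math in row_common.cc; the coefficient values are quoted from the upstream libyuv sources, not from this patch excerpt):

#include <stdint.h>

// Studio-range luma (16..235), as used by ARGBToI420:
static inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
}
// Full-range JPEG luma (0..255), as used by the ARGBToJ* paths:
static inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  return (77 * r + 150 * g + 29 * b + 128) >> 8;
}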
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3; + ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); + src_argb += src_stride_argb * 2; + dst_yj += dst_stride_yj * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + } + return 0; +} + +// Convert ARGB to J400. +LIBYUV_API +int ARGBToJ400(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + int width, int height) { + int y; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
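The SSSE3/AVX2/NEON ladders in ARGBToJ420 above and ARGBToJ400 here follow libyuv's standard runtime-dispatch pattern: start from the portable _C kernel, switch to an _Any_ variant once the width clears the SIMD batch size, and use the exact-width (and, on the older SSSE3 paths, alignment-checked) variant only when the remaining conditions hold. A condensed sketch of the pattern, with FooRow* as placeholder kernel names:

void (*FooRow)(const uint8_t* src, uint8_t* dst, int width) = FooRow_C;
#if defined(HAS_FOOROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
  FooRow = FooRow_Any_SSSE3;        // any width >= 16, handles the ragged tail
  if (IS_ALIGNED(width, 16)) {
    FooRow = FooRow_SSSE3;          // fast path for exact multiples of 16
  }
}
#endif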
+ if (src_stride_argb == width * 4 && + dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yj = 0; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYJRow(src_argb, dst_yj, width); + src_argb += src_stride_argb; + dst_yj += dst_stride_yj; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/convert_jpeg.cc b/TMessagesProj/jni/libyuv/source/convert_jpeg.cc new file mode 100644 index 000000000..bcb980f7f --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/convert_jpeg.cc @@ -0,0 +1,392 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/convert.h" + +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#ifdef HAVE_JPEG +struct I420Buffers { + uint8* y; + int y_stride; + uint8* u; + int u_stride; + uint8* v; + int v_stride; + int w; + int h; +}; + +static void JpegCopyI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I420Copy(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI422ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I422ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI444ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I444ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI411ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I411ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI400ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I400ToI420(data[0], strides[0], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8* sample, size_t sample_size, + int* width, int* height) { + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret) { + *width = mjpeg_decoder.GetWidth(); + *height = mjpeg_decoder.GetHeight(); + } + mjpeg_decoder.UnloadFrame(); + return ret ? 0 : -1; // -1 for runtime failure. +} + +// MJPG (Motion JPeg) to I420 +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +LIBYUV_API +int MJPGToI420(const uint8* sample, + size_t sample_size, + uint8* y, int y_stride, + uint8* u, int u_stride, + uint8* v, int v_stride, + int w, int h, + int dw, int dh) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. 
+ MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != w || + mjpeg_decoder.GetHeight() != h)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh }; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh); + // YUV411 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 4 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. 411 is supported by libjpeg + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 
0 : 1; +} + +#ifdef HAVE_JPEG +struct ARGBBuffers { + uint8* argb; + int argb_stride; + int w; + int h; +}; + +static void JpegI420ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I420ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI422ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I422ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI444ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I444ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI411ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I411ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI400ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I400ToARGB(data[0], strides[0], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +// MJPG (Motion JPeg) to ARGB +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +LIBYUV_API +int MJPGToARGB(const uint8* sample, + size_t sample_size, + uint8* argb, int argb_stride, + int w, int h, + int dw, int dh) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. 
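Typical use of these MJPG entry points is to probe the frame first, size the destination, then decode. A minimal caller-side sketch against the functions in this file (jpeg_data, jpeg_size and argb_out are placeholders; the ARGB buffer must hold w * h * 4 bytes):

int w = 0, h = 0;
if (MJPGSize(jpeg_data, jpeg_size, &w, &h) == 0) {
  int r = MJPGToARGB(jpeg_data, jpeg_size,
                     argb_out, w * 4,   // destination and stride in bytes
                     w, h,              // dimensions as encoded in the JPEG
                     w, h);             // requested destination dimensions
  // r == 0 on success, 1 on a runtime decode failure, -1 on bad arguments.
}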
+ MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != w || + mjpeg_decoder.GetHeight() != h)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + ARGBBuffers bufs = { argb, argb_stride, dw, dh }; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh); + // YUV411 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 4 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. 411 is supported by libjpeg + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 0 : 1; +} +#endif + +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/convert_to_argb.cc b/TMessagesProj/jni/libyuv/source/convert_to_argb.cc new file mode 100644 index 000000000..1b228a7b4 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/convert_to_argb.cc @@ -0,0 +1,327 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_argb.h" + +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// src_width is used for source stride computation +// src_height is used to compute location of planes, and indicate inversion +// sample_size is measured in bytes and is the size of the frame. +// With MJPEG it is the compressed size of the frame. +LIBYUV_API +int ConvertToARGB(const uint8* sample, size_t sample_size, + uint8* crop_argb, int argb_stride, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 fourcc) { + uint32 format = CanonicalFourCC(fourcc); + int aligned_src_width = (src_width + 1) & ~1; + const uint8* src; + const uint8* src_uv; + int abs_src_height = (src_height < 0) ? -src_height : src_height; + int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; + int r = 0; + + // One pass rotation is available for some formats. For the rest, convert + // to I420 (with optional vertical flipping) into a temporary I420 buffer, + // and then rotate the I420 to the final destination buffer. + // For in-place conversion, if destination crop_argb is same as source sample, + // also enable temporary buffer. + LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) || + crop_argb == sample; + uint8* tmp_argb = crop_argb; + int tmp_argb_stride = argb_stride; + uint8* rotate_buffer = NULL; + int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; + + if (crop_argb == NULL || sample == NULL || + src_width <= 0 || crop_width <= 0 || + src_height == 0 || crop_height == 0) { + return -1; + } + if (src_height < 0) { + inv_crop_height = -inv_crop_height; + } + + if (need_buf) { + int argb_size = crop_width * abs_crop_height * 4; + rotate_buffer = (uint8*)malloc(argb_size); + if (!rotate_buffer) { + return 1; // Out of memory runtime error. 
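When a rotation is requested (or the destination aliases the source), ConvertToARGB converts into the rotate_buffer allocated above and only copies into the caller's crop_argb at the end via ARGBRotate. A minimal direct call, whole frame with no crop or rotation, might look like this (a sketch; it assumes the libyuv headers are included, and yuy2_frame, argb_out, width and height are placeholders):

int r = ConvertToARGB(yuy2_frame, (size_t)width * height * 2,  // YUY2 is 2 bytes/pixel
                      argb_out, width * 4,                     // dst stride in bytes
                      0, 0,                                    // crop_x, crop_y
                      width, height,                           // source dimensions
                      width, height,                           // crop dimensions
                      kRotate0, FOURCC_YUY2);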
+ } + crop_argb = rotate_buffer; + argb_stride = crop_width; + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToARGB(src, aligned_src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToARGB(src, aligned_src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToARGB(src, src_width * 3, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToARGB(src, src_width * 3, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToARGB(src, src_width * 4, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToARGB(src, src_width * 4, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToARGB(src, src_width * 4, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToARGB(src, src_width * 4, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToARGB(src, src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToARGB(src, src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToARGB(src, src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + // TODO(fbarchard): Support cropping Bayer by odd numbers + // by adjusting fourcc. + case FOURCC_BGGR: + src = sample + (src_width * crop_y + crop_x); + r = BayerBGGRToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + case FOURCC_GBRG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGBRGToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + case FOURCC_GRBG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGRBGToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + case FOURCC_RGGB: + src = sample + (src_width * crop_y + crop_x); + r = BayerRGGBToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + r = NV12ToARGB(src, src_width, + src_uv, aligned_src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + // Call NV12 but with u and v parameters swapped. 
+ r = NV21ToARGB(src, src_width, + src_uv, aligned_src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; +// case FOURCC_Q420: +// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x; +// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y + +// src_width + crop_x * 2; +// r = Q420ToARGB(src, src_width * 3, +// src_uv, src_width * 3, +// crop_argb, argb_stride, +// crop_width, inv_crop_height); +// break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToARGB(src_y, src_width, + src_u, src_width, + src_v, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I411: { + int quarterwidth = (src_width + 3) / 4; + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u = sample + src_width * abs_src_height + + quarterwidth * crop_y + crop_x / 4; + const uint8* src_v = sample + src_width * abs_src_height + + quarterwidth * (abs_src_height + crop_y) + crop_x / 4; + r = I411ToARGB(src_y, src_width, + src_u, quarterwidth, + src_v, quarterwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } +#ifdef HAVE_JPEG + case FOURCC_MJPG: + r = MJPGToARGB(sample, sample_size, + crop_argb, argb_stride, + src_width, abs_src_height, crop_width, inv_crop_height); + break; +#endif + default: + r = -1; // unknown fourcc 
- return failure code. + } + + if (need_buf) { + if (!r) { + r = ARGBRotate(crop_argb, argb_stride, + tmp_argb, tmp_argb_stride, + crop_width, abs_crop_height, rotation); + } + free(rotate_buffer); + } + + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/convert_to_i420.cc b/TMessagesProj/jni/libyuv/source/convert_to_i420.cc new file mode 100644 index 000000000..7b194fff7 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/convert_to_i420.cc @@ -0,0 +1,383 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "libyuv/convert.h" + +#include "libyuv/format_conversion.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// src_width is used for source stride computation +// src_height is used to compute location of planes, and indicate inversion +// sample_size is measured in bytes and is the size of the frame. +// With MJPEG it is the compressed size of the frame. +LIBYUV_API +int ConvertToI420(const uint8* sample, + size_t sample_size, + uint8* y, int y_stride, + uint8* u, int u_stride, + uint8* v, int v_stride, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 fourcc) { + uint32 format = CanonicalFourCC(fourcc); + int aligned_src_width = (src_width + 1) & ~1; + const uint8* src; + const uint8* src_uv; + int abs_src_height = (src_height < 0) ? -src_height : src_height; + int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; + int r = 0; + LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 && + format != FOURCC_NV12 && format != FOURCC_NV21 && + format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample; + uint8* tmp_y = y; + uint8* tmp_u = u; + uint8* tmp_v = v; + int tmp_y_stride = y_stride; + int tmp_u_stride = u_stride; + int tmp_v_stride = v_stride; + uint8* rotate_buffer = NULL; + int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; + + if (!y || !u || !v || !sample || + src_width <= 0 || crop_width <= 0 || + src_height == 0 || crop_height == 0) { + return -1; + } + if (src_height < 0) { + inv_crop_height = -inv_crop_height; + } + + // One pass rotation is available for some formats. For the rest, convert + // to I420 (with optional vertical flipping) into a temporary I420 buffer, + // and then rotate the I420 to the final destination buffer. + // For in-place conversion, if destination y is same as source sample, + // also enable temporary buffer. + if (need_buf) { + int y_size = crop_width * abs_crop_height; + int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); + rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); + if (!rotate_buffer) { + return 1; // Out of memory runtime error. 
+ } + y = rotate_buffer; + u = y + y_size; + v = u + uv_size; + y_stride = crop_width; + u_stride = v_stride = ((crop_width + 1) / 2); + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToI420(src, aligned_src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToI420(src, aligned_src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToI420(src, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToI420(src, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + // TODO(fbarchard): Support cropping Bayer by odd numbers + // by adjusting fourcc. 
+ case FOURCC_BGGR: + src = sample + (src_width * crop_y + crop_x); + r = BayerBGGRToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_GBRG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGBRGToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_GRBG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGRBGToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGGB: + src = sample + (src_width * crop_y + crop_x); + r = BayerRGGBToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + r = NV12ToI420Rotate(src, src_width, + src_uv, aligned_src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height, rotation); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + // Call NV12 but with u and v parameters swapped. + r = NV12ToI420Rotate(src, src_width, + src_uv, aligned_src_width, + y, y_stride, + v, v_stride, + u, u_stride, + crop_width, inv_crop_height, rotation); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_Q420: + src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x; + src_uv = sample + (src_width + aligned_src_width * 2) * crop_y + + src_width + crop_x * 2; + r = Q420ToI420(src, src_width * 3, + src_uv, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420Rotate(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height, rotation); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_v = 
sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToI420(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToI420(src_y, src_width,
+ src_u, src_width,
+ src_v, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (src_width + 3) / 4;
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u = sample + src_width * abs_src_height +
+ quarterwidth * crop_y + crop_x / 4;
+ const uint8* src_v = sample + src_width * abs_src_height +
+ quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+ r = I411ToI420(src_y, src_width,
+ src_u, quarterwidth,
+ src_v, quarterwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToI420(sample, sample_size,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ src_width, abs_src_height, crop_width, inv_crop_height);
+ break;
+#endif
+ default:
+ r = -1; // unknown fourcc - return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = I420Rotate(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ tmp_y, tmp_y_stride,
+ tmp_u, tmp_u_stride,
+ tmp_v, tmp_v_stride,
+ crop_width, abs_crop_height, rotation);
+ }
+ free(rotate_buffer);
+ }
+
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/TMessagesProj/jni/libyuv/source/cpu_id.cc b/TMessagesProj/jni/libyuv/source/cpu_id.cc
new file mode 100644
index 000000000..deb4c4465
--- /dev/null
+++ b/TMessagesProj/jni/libyuv/source/cpu_id.cc
@@ -0,0 +1,293 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/cpu_id.h"
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h> // For __cpuidex()
+#endif
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ !defined(__native_client__) && \
+ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h> // For _xgetbv()
+#endif
+
+#if !defined(__native_client__)
+#include <stdlib.h> // For getenv()
+#endif
+
+// For ArmCpuCaps() but unittested on all platforms
+#include <stdio.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h" // For CPU_X86
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid additional check.
+#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else
+#define SAFEBUFFERS
+#endif
+
+// Low level cpuid for X86. Returns zeros on other CPUs.
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + (defined(_M_IX86) || defined(_M_X64) || \ + defined(__i386__) || defined(__x86_64__)) +LIBYUV_API +void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { +#if defined(_MSC_VER) && !defined(__clang__) +#if (_MSC_FULL_VER >= 160040219) + __cpuidex((int*)(cpu_info), info_eax, info_ecx); +#elif defined(_M_IX86) + __asm { + mov eax, info_eax + mov ecx, info_ecx + mov edi, cpu_info + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + } +#else + if (info_ecx == 0) { + __cpuid((int*)(cpu_info), info_eax); + } else { + cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0; + } +#endif +#else // defined(_MSC_VER) + uint32 info_ebx, info_edx; + asm volatile ( // NOLINT +#if defined( __i386__) && defined(__PIC__) + // Preserve ebx for fpic 32 bit. + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=D" (info_ebx), +#else + "cpuid \n" + : "=b" (info_ebx), +#endif // defined( __i386__) && defined(__PIC__) + "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx)); + cpu_info[0] = info_eax; + cpu_info[1] = info_ebx; + cpu_info[2] = info_ecx; + cpu_info[3] = info_edx; +#endif // defined(_MSC_VER) +} + +#if !defined(__native_client__) +#define HAS_XGETBV +// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. +int TestOsSaveYmm() { + uint32 xcr0 = 0u; +#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) + xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. +#elif defined(_M_IX86) && defined(_MSC_VER) + __asm { + xor ecx, ecx // xcr 0 + _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. + mov xcr0, eax + } +#elif defined(__i386__) || defined(__x86_64__) + asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); +#endif // defined(_MSC_VER) + return((xcr0 & 6) == 6); // Is ymm saved? +} +#endif // !defined(__native_client__) +#else +LIBYUV_API +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; +} +#endif + +// based on libvpx arm_cpudetect.c +// For Arm, but public to allow testing on any CPU +LIBYUV_API SAFEBUFFERS +int ArmCpuCaps(const char* cpuinfo_name) { + char cpuinfo_line[512]; + FILE* f = fopen(cpuinfo_name, "r"); + if (!f) { + // Assume Neon if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return kCpuHasNEON; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "Features", 8) == 0) { + char* p = strstr(cpuinfo_line, " neon"); + if (p && (p[5] == ' ' || p[5] == '\n')) { + fclose(f); + return kCpuHasNEON; + } + } + } + fclose(f); + return 0; +} + +#if defined(__mips__) && defined(__linux__) +static int MipsCpuCaps(const char* search_string) { + char cpuinfo_line[512]; + const char* file_name = "/proc/cpuinfo"; + FILE* f = fopen(file_name, "r"); + if (!f) { + // Assume DSP if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return kCpuHasMIPS_DSP; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) { + if (strstr(cpuinfo_line, search_string) != NULL) { + fclose(f); + return kCpuHasMIPS_DSP; + } + } + fclose(f); + return 0; +} +#endif + +// CPU detect function for SIMD instruction sets. +LIBYUV_API +int cpu_info_ = kCpuInit; // cpu_info is not initialized yet. + +// Test environment variable for disabling CPU features. Any non-zero value +// to disable. Zero ignored to make it easy to set the variable on/off. 
+#if !defined(__native_client__) && !defined(_M_ARM) + +static LIBYUV_BOOL TestEnv(const char* name) { + const char* var = getenv(name); + if (var) { + if (var[0] != '0') { + return LIBYUV_TRUE; + } + } + return LIBYUV_FALSE; +} +#else // nacl does not support getenv(). +static LIBYUV_BOOL TestEnv(const char*) { + return LIBYUV_FALSE; +} +#endif + +LIBYUV_API SAFEBUFFERS +int InitCpuFlags(void) { +#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86) + + uint32 cpu_info0[4] = { 0, 0, 0, 0 }; + uint32 cpu_info1[4] = { 0, 0, 0, 0 }; + uint32 cpu_info7[4] = { 0, 0, 0, 0 }; + CpuId(0, 0, cpu_info0); + CpuId(1, 0, cpu_info1); + if (cpu_info0[0] >= 7) { + CpuId(7, 0, cpu_info7); + } + cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | + ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | + ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | + ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | + kCpuHasX86; + +#ifdef HAS_XGETBV + if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave + TestOsSaveYmm()) { // Saves YMM. + cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | + kCpuHasAVX; + } +#endif + // Environment variable overrides for testing. + if (TestEnv("LIBYUV_DISABLE_X86")) { + cpu_info_ &= ~kCpuHasX86; + } + if (TestEnv("LIBYUV_DISABLE_SSE2")) { + cpu_info_ &= ~kCpuHasSSE2; + } + if (TestEnv("LIBYUV_DISABLE_SSSE3")) { + cpu_info_ &= ~kCpuHasSSSE3; + } + if (TestEnv("LIBYUV_DISABLE_SSE41")) { + cpu_info_ &= ~kCpuHasSSE41; + } + if (TestEnv("LIBYUV_DISABLE_SSE42")) { + cpu_info_ &= ~kCpuHasSSE42; + } + if (TestEnv("LIBYUV_DISABLE_AVX")) { + cpu_info_ &= ~kCpuHasAVX; + } + if (TestEnv("LIBYUV_DISABLE_AVX2")) { + cpu_info_ &= ~kCpuHasAVX2; + } + if (TestEnv("LIBYUV_DISABLE_ERMS")) { + cpu_info_ &= ~kCpuHasERMS; + } + if (TestEnv("LIBYUV_DISABLE_FMA3")) { + cpu_info_ &= ~kCpuHasFMA3; + } +#elif defined(__mips__) && defined(__linux__) + // Linux mips parse text file for dsp detect. + cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP. +#if defined(__mips_dspr2) + cpu_info_ |= kCpuHasMIPS_DSPR2; +#endif + cpu_info_ |= kCpuHasMIPS; + + if (getenv("LIBYUV_DISABLE_MIPS")) { + cpu_info_ &= ~kCpuHasMIPS; + } + if (getenv("LIBYUV_DISABLE_MIPS_DSP")) { + cpu_info_ &= ~kCpuHasMIPS_DSP; + } + if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) { + cpu_info_ &= ~kCpuHasMIPS_DSPR2; + } +#elif defined(__arm__) || defined(__aarch64__) +// gcc -mfpu=neon defines __ARM_NEON__ +// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. +// For Linux, /proc/cpuinfo can be tested but without that assume Neon. +#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) + cpu_info_ = kCpuHasNEON; +// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon +// flag in it. +// So for aarch64, neon enabling is hard coded here. +#elif defined(__aarch64__) + cpu_info_ = kCpuHasNEON; +#else + // Linux arm parse text file for neon detect. 
+ cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); +#endif + cpu_info_ |= kCpuHasARM; + if (TestEnv("LIBYUV_DISABLE_NEON")) { + cpu_info_ &= ~kCpuHasNEON; + } +#endif // __arm__ + if (TestEnv("LIBYUV_DISABLE_ASM")) { + cpu_info_ = 0; + } + return cpu_info_; +} + +LIBYUV_API +void MaskCpuFlags(int enable_flags) { + cpu_info_ = InitCpuFlags() & enable_flags; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/format_conversion.cc b/TMessagesProj/jni/libyuv/source/format_conversion.cc new file mode 100644 index 000000000..3c1737153 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/format_conversion.cc @@ -0,0 +1,554 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/format_conversion.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/video_common.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// generate a selector mask useful for pshufb +static uint32 GenerateSelector(int select0, int select1) { + return (uint32)(select0) | + (uint32)((select1 + 4) << 8) | + (uint32)((select0 + 8) << 16) | + (uint32)((select1 + 12) << 24); +} + +static int MakeSelectors(const int blue_index, + const int green_index, + const int red_index, + uint32 dst_fourcc_bayer, + uint32* index_map) { + // Now build a lookup table containing the indices for the four pixels in each + // 2x2 Bayer grid. + switch (dst_fourcc_bayer) { + case FOURCC_BGGR: + index_map[0] = GenerateSelector(blue_index, green_index); + index_map[1] = GenerateSelector(green_index, red_index); + break; + case FOURCC_GBRG: + index_map[0] = GenerateSelector(green_index, blue_index); + index_map[1] = GenerateSelector(red_index, green_index); + break; + case FOURCC_RGGB: + index_map[0] = GenerateSelector(red_index, green_index); + index_map[1] = GenerateSelector(green_index, blue_index); + break; + case FOURCC_GRBG: + index_map[0] = GenerateSelector(green_index, red_index); + index_map[1] = GenerateSelector(blue_index, green_index); + break; + default: + return -1; // Bad FourCC + } + return 0; +} + +// Converts 32 bit ARGB to Bayer RGB formats. 
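GenerateSelector packs four byte indices into one 32-bit pshufb-style selector, MakeSelectors builds one selector per Bayer row phase, and ARGBToBayer below alternates between them with index_map[y & 1]. Worked out for FOURCC_BGGR with the ARGB byte offsets used above (B = 0, G = 1, R = 2), the two selectors pick these source bytes out of each group of four ARGB pixels:

// Even rows: GenerateSelector(blue, green) -> byte indices 0, 5, 8, 13 -> B0 G1 B2 G3
// Odd rows:  GenerateSelector(green, red)  -> byte indices 1, 6, 9, 14 -> G0 R1 G2 R3
uint32_t even = 0u | (5u << 8) | (8u << 16) | (13u << 24);  // 0x0D080500
uint32_t odd  = 1u | (6u << 8) | (9u << 16) | (14u << 24);  // 0x0E090601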
+LIBYUV_API +int ARGBToBayer(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height, + uint32 dst_fourcc_bayer) { + int y; + const int blue_index = 0; // Offsets for ARGB format + const int green_index = 1; + const int red_index = 2; + uint32 index_map[2]; + void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) = ARGBToBayerRow_C; + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } + } +#elif defined(HAS_ARGBTOBAYERROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToBayerRow = ARGBToBayerRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_NEON; + } + } +#endif + if (MakeSelectors(blue_index, green_index, red_index, + dst_fourcc_bayer, index_map)) { + return -1; // Bad FourCC + } + + for (y = 0; y < height; ++y) { + ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width); + src_argb += src_stride_argb; + dst_bayer += dst_stride_bayer; + } + return 0; +} + +#define AVG(a, b) (((a) + (b)) >> 1) + +static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_argb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 g = src_bayer0[1]; + uint8 r = src_bayer1[1]; + int x; + for (x = 0; x < pix - 2; x += 2) { + dst_argb[0] = src_bayer0[0]; + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = AVG(r, src_bayer1[1]); + dst_argb[3] = 255U; + dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = src_bayer1[1]; + dst_argb[7] = 255U; + g = src_bayer0[1]; + r = src_bayer1[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_argb += 8; + } + dst_argb[0] = src_bayer0[0]; + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = AVG(r, src_bayer1[1]); + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer0[0]; + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = src_bayer1[1]; + dst_argb[7] = 255U; + } +} + +static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_argb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 g = src_bayer0[1]; + uint8 b = src_bayer1[1]; + int x; + for (x = 0; x < pix - 2; x += 2) { + dst_argb[0] = AVG(b, src_bayer1[1]); + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = src_bayer0[0]; + dst_argb[3] = 255U; + dst_argb[4] = src_bayer1[1]; + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[7] = 255U; + g = src_bayer0[1]; + b = src_bayer1[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_argb += 8; + } + dst_argb[0] = AVG(b, src_bayer1[1]); + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = src_bayer0[0]; + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer1[1]; + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = src_bayer0[0]; + dst_argb[7] = 255U; + } +} + +static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_argb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 b = src_bayer0[1]; + int x; + for (x = 0; x < pix - 2; x += 2) { + dst_argb[0] = AVG(b, src_bayer0[1]); + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = src_bayer1[0]; + 
dst_argb[3] = 255U; + dst_argb[4] = src_bayer0[1]; + dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]); + dst_argb[7] = 255U; + b = src_bayer0[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_argb += 8; + } + dst_argb[0] = AVG(b, src_bayer0[1]); + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = src_bayer1[0]; + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer0[1]; + dst_argb[5] = src_bayer0[0]; + dst_argb[6] = src_bayer1[0]; + dst_argb[7] = 255U; + } +} + +static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_argb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 r = src_bayer0[1]; + int x; + for (x = 0; x < pix - 2; x += 2) { + dst_argb[0] = src_bayer1[0]; + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = AVG(r, src_bayer0[1]); + dst_argb[3] = 255U; + dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]); + dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[6] = src_bayer0[1]; + dst_argb[7] = 255U; + r = src_bayer0[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_argb += 8; + } + dst_argb[0] = src_bayer1[0]; + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = AVG(r, src_bayer0[1]); + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer1[0]; + dst_argb[5] = src_bayer0[0]; + dst_argb[6] = src_bayer0[1]; + dst_argb[7] = 255U; + } +} + +// Converts any Bayer RGB format to ARGB. +LIBYUV_API +int BayerToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + uint32 src_fourcc_bayer) { + int y; + void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int pix); + void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int pix); + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + switch (src_fourcc_bayer) { + case FOURCC_BGGR: + BayerRow0 = BayerRowBG; + BayerRow1 = BayerRowGR; + break; + case FOURCC_GBRG: + BayerRow0 = BayerRowGB; + BayerRow1 = BayerRowRG; + break; + case FOURCC_GRBG: + BayerRow0 = BayerRowGR; + BayerRow1 = BayerRowBG; + break; + case FOURCC_RGGB: + BayerRow0 = BayerRowRG; + BayerRow1 = BayerRowGB; + break; + default: + return -1; // Bad FourCC + } + + for (y = 0; y < height - 1; y += 2) { + BayerRow0(src_bayer, src_stride_bayer, dst_argb, width); + BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, + dst_argb + dst_stride_argb, width); + src_bayer += src_stride_bayer * 2; + dst_argb += dst_stride_argb * 2; + } + if (height & 1) { + BayerRow0(src_bayer, src_stride_bayer, dst_argb, width); + } + return 0; +} + +// Converts any Bayer RGB format to ARGB. +LIBYUV_API +int BayerToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + uint32 src_fourcc_bayer) { + void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int pix); + void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int pix); + + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + // Negative height means invert the image. 
+ if (height < 0) { + int halfheight; + height = -height; + halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + ARGBToUVRow = ARGBToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif + + switch (src_fourcc_bayer) { + case FOURCC_BGGR: + BayerRow0 = BayerRowBG; + BayerRow1 = BayerRowGR; + break; + case FOURCC_GBRG: + BayerRow0 = BayerRowGB; + BayerRow1 = BayerRowRG; + break; + case FOURCC_GRBG: + BayerRow0 = BayerRowGR; + BayerRow1 = BayerRowBG; + break; + case FOURCC_RGGB: + BayerRow0 = BayerRowRG; + BayerRow1 = BayerRowGB; + break; + default: + return -1; // Bad FourCC + } + + { + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + int y; + for (y = 0; y < height - 1; y += 2) { + BayerRow0(src_bayer, src_stride_bayer, row, width); + BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, + row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + src_bayer += src_stride_bayer * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + BayerRow0(src_bayer, src_stride_bayer, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + } + free_aligned_buffer_64(row); + } + return 0; +} + +// Convert I420 to Bayer. +LIBYUV_API +int I420ToBayer(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height, + uint32 dst_fourcc_bayer) { + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; + void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) = ARGBToBayerRow_C; + const int blue_index = 0; // Offsets for ARGB format + const int green_index = 1; + const int red_index = 2; + uint32 index_map[2]; + // Negative height means invert the image. 
+ if (height < 0) { + int halfheight; + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } + } +#elif defined(HAS_ARGBTOBAYERROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToBayerRow = ARGBToBayerRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_NEON; + } + } +#endif + + if (MakeSelectors(blue_index, green_index, red_index, + dst_fourcc_bayer, index_map)) { + return -1; // Bad FourCC + } + { + // Allocate a row of ARGB. 
+    align_buffer_64(row, width * 4);
+    int y;
+    for (y = 0; y < height; ++y) {
+      I422ToARGBRow(src_y, src_u, src_v, row, width);
+      ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
+      dst_bayer += dst_stride_bayer;
+      src_y += src_stride_y;
+      if (y & 1) {
+        src_u += src_stride_u;
+        src_v += src_stride_v;
+      }
+    }
+    free_aligned_buffer_64(row);
+  }
+  return 0;
+}
+
+#define MAKEBAYERFOURCC(BAYER) \
+LIBYUV_API \
+int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \
+                         uint8* dst_y, int dst_stride_y, \
+                         uint8* dst_u, int dst_stride_u, \
+                         uint8* dst_v, int dst_stride_v, \
+                         int width, int height) { \
+  return BayerToI420(src_bayer, src_stride_bayer, \
+                     dst_y, dst_stride_y, \
+                     dst_u, dst_stride_u, \
+                     dst_v, dst_stride_v, \
+                     width, height, \
+                     FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \
+                       const uint8* src_u, int src_stride_u, \
+                       const uint8* src_v, int src_stride_v, \
+                       uint8* dst_bayer, int dst_stride_bayer, \
+                       int width, int height) { \
+  return I420ToBayer(src_y, src_stride_y, \
+                     src_u, src_stride_u, \
+                     src_v, src_stride_v, \
+                     dst_bayer, dst_stride_bayer, \
+                     width, height, \
+                     FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \
+                       uint8* dst_bayer, int dst_stride_bayer, \
+                       int width, int height) { \
+  return ARGBToBayer(src_argb, src_stride_argb, \
+                     dst_bayer, dst_stride_bayer, \
+                     width, height, \
+                     FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \
+                         uint8* dst_argb, int dst_stride_argb, \
+                         int width, int height) { \
+  return BayerToARGB(src_bayer, src_stride_bayer, \
+                     dst_argb, dst_stride_argb, \
+                     width, height, \
+                     FOURCC_##BAYER); \
+}
+
+MAKEBAYERFOURCC(BGGR)
+MAKEBAYERFOURCC(GBRG)
+MAKEBAYERFOURCC(GRBG)
+MAKEBAYERFOURCC(RGGB)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/TMessagesProj/jni/libyuv/source/mjpeg_decoder.cc b/TMessagesProj/jni/libyuv/source/mjpeg_decoder.cc
new file mode 100644
index 000000000..36028c3cc
--- /dev/null
+++ b/TMessagesProj/jni/libyuv/source/mjpeg_decoder.cc
@@ -0,0 +1,566 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include <assert.h>
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
+#include <setjmp.h>
+#define HAVE_SETJMP
+#endif
+struct FILE;  // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jpeglib.h>
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h"  // For CopyPlane().
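A minimal usage sketch for the MJpegDecoder implemented below, included here for illustration only and not part of the upstream file; the jpeg_data/jpeg_size inputs and the per-plane sizing via GetComponentSize() are assumptions made for the example.

#include "libyuv/mjpeg_decoder.h"

bool DecodeSketch(const uint8* jpeg_data, size_t jpeg_size) {
  libyuv::MJpegDecoder decoder;
  if (!decoder.LoadFrame(jpeg_data, jpeg_size)) {  // parses the JPEG header
    return false;
  }
  int num_components = decoder.GetNumComponents();
  uint8* planes[4] = {NULL, NULL, NULL, NULL};
  for (int i = 0; i < num_components; ++i) {
    planes[i] = new uint8[decoder.GetComponentSize(i)];  // tightly packed plane
  }
  bool ok = decoder.DecodeToBuffers(planes, decoder.GetWidth(),
                                    decoder.GetHeight()) != 0;
  decoder.UnloadFrame();
  for (int i = 0; i < num_components; ++i) {
    delete [] planes[i];
  }
  return ok;
}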
+ +namespace libyuv { + +#ifdef HAVE_SETJMP +struct SetJmpErrorMgr { + jpeg_error_mgr base; // Must be at the top + jmp_buf setjmp_buffer; +}; +#endif + +const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN; +const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE; +const int MJpegDecoder::kColorSpaceRgb = JCS_RGB; +const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr; +const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK; +const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK; + +// Methods that are passed to jpeglib. +boolean fill_input_buffer(jpeg_decompress_struct* cinfo); +void init_source(jpeg_decompress_struct* cinfo); +void skip_input_data(jpeg_decompress_struct* cinfo, + long num_bytes); // NOLINT +void term_source(jpeg_decompress_struct* cinfo); +void ErrorHandler(jpeg_common_struct* cinfo); + +MJpegDecoder::MJpegDecoder() + : has_scanline_padding_(LIBYUV_FALSE), + num_outbufs_(0), + scanlines_(NULL), + scanlines_sizes_(NULL), + databuf_(NULL), + databuf_strides_(NULL) { + decompress_struct_ = new jpeg_decompress_struct; + source_mgr_ = new jpeg_source_mgr; +#ifdef HAVE_SETJMP + error_mgr_ = new SetJmpErrorMgr; + decompress_struct_->err = jpeg_std_error(&error_mgr_->base); + // Override standard exit()-based error handler. + error_mgr_->base.error_exit = &ErrorHandler; +#endif + decompress_struct_->client_data = NULL; + source_mgr_->init_source = &init_source; + source_mgr_->fill_input_buffer = &fill_input_buffer; + source_mgr_->skip_input_data = &skip_input_data; + source_mgr_->resync_to_restart = &jpeg_resync_to_restart; + source_mgr_->term_source = &term_source; + jpeg_create_decompress(decompress_struct_); + decompress_struct_->src = source_mgr_; + buf_vec_.buffers = &buf_; + buf_vec_.len = 1; +} + +MJpegDecoder::~MJpegDecoder() { + jpeg_destroy_decompress(decompress_struct_); + delete decompress_struct_; + delete source_mgr_; +#ifdef HAVE_SETJMP + delete error_mgr_; +#endif + DestroyOutputBuffers(); +} + +LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { + if (!ValidateJpeg(src, src_len)) { + return LIBYUV_FALSE; + } + + buf_.data = src; + buf_.len = static_cast(src_len); + buf_vec_.pos = 0; + decompress_struct_->client_data = &buf_vec_; +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_read_header, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return LIBYUV_FALSE; + } +#endif + if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) { + // ERROR: Bad MJPEG header + return LIBYUV_FALSE; + } + AllocOutputBuffers(GetNumComponents()); + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_size = GetComponentScanlinesPerImcuRow(i); + if (scanlines_sizes_[i] != scanlines_size) { + if (scanlines_[i]) { + delete scanlines_[i]; + } + scanlines_[i] = new uint8* [scanlines_size]; + scanlines_sizes_[i] = scanlines_size; + } + + // We allocate padding for the final scanline to pad it up to DCTSIZE bytes + // to avoid memory errors, since jpeglib only reads full MCUs blocks. For + // the preceding scanlines, the padding is not needed/wanted because the + // following addresses will already be valid (they are the initial bytes of + // the next scanline) and will be overwritten when jpeglib writes out that + // next scanline. 
+ int databuf_stride = GetComponentStride(i); + int databuf_size = scanlines_size * databuf_stride; + if (databuf_strides_[i] != databuf_stride) { + if (databuf_[i]) { + delete databuf_[i]; + } + databuf_[i] = new uint8[databuf_size]; + databuf_strides_[i] = databuf_stride; + } + + if (GetComponentStride(i) != GetComponentWidth(i)) { + has_scanline_padding_ = LIBYUV_TRUE; + } + } + return LIBYUV_TRUE; +} + +static int DivideAndRoundUp(int numerator, int denominator) { + return (numerator + denominator - 1) / denominator; +} + +static int DivideAndRoundDown(int numerator, int denominator) { + return numerator / denominator; +} + +// Returns width of the last loaded frame. +int MJpegDecoder::GetWidth() { + return decompress_struct_->image_width; +} + +// Returns height of the last loaded frame. +int MJpegDecoder::GetHeight() { + return decompress_struct_->image_height; +} + +// Returns format of the last loaded frame. The return value is one of the +// kColorSpace* constants. +int MJpegDecoder::GetColorSpace() { + return decompress_struct_->jpeg_color_space; +} + +// Number of color components in the color space. +int MJpegDecoder::GetNumComponents() { + return decompress_struct_->num_components; +} + +// Sample factors of the n-th component. +int MJpegDecoder::GetHorizSampFactor(int component) { + return decompress_struct_->comp_info[component].h_samp_factor; +} + +int MJpegDecoder::GetVertSampFactor(int component) { + return decompress_struct_->comp_info[component].v_samp_factor; +} + +int MJpegDecoder::GetHorizSubSampFactor(int component) { + return decompress_struct_->max_h_samp_factor / + GetHorizSampFactor(component); +} + +int MJpegDecoder::GetVertSubSampFactor(int component) { + return decompress_struct_->max_v_samp_factor / + GetVertSampFactor(component); +} + +int MJpegDecoder::GetImageScanlinesPerImcuRow() { + return decompress_struct_->max_v_samp_factor * DCTSIZE; +} + +int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs); +} + +int MJpegDecoder::GetComponentWidth(int component) { + int hs = GetHorizSubSampFactor(component); + return DivideAndRoundUp(GetWidth(), hs); +} + +int MJpegDecoder::GetComponentHeight(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetHeight(), vs); +} + +// Get width in bytes padded out to a multiple of DCTSIZE +int MJpegDecoder::GetComponentStride(int component) { + return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1); +} + +int MJpegDecoder::GetComponentSize(int component) { + return GetComponentWidth(component) * GetComponentHeight(component); +} + +LIBYUV_BOOL MJpegDecoder::UnloadFrame() { +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_abort_decompress, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return LIBYUV_FALSE; + } +#endif + jpeg_abort_decompress(decompress_struct_); + return LIBYUV_TRUE; +} + +// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. +LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( + uint8** planes, int dst_width, int dst_height) { + if (dst_width != GetWidth() || + dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. 
+ // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // Compute amount of lines to skip to implement vertical crop. + // TODO(fbarchard): Ensure skip is a multiple of maximum component + // subsample. ie 2 + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + // There is no API to skip lines in the output data, so we read them + // into the temp buffer. + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. Must read it and then + // copy the parts we want into the destination. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = + DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) - + rows_to_skip; + int data_to_skip = rows_to_skip * GetComponentStride(i); + CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + lines_left -= (GetImageScanlinesPerImcuRow() - skip); + } + } + + // Read full MCUs but cropped horizontally + for (; lines_left > GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); + CopyPlane(databuf_[i], GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = + DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); + CopyPlane(databuf_[i], GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + return FinishDecode(); +} + +LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, + int dst_width, int dst_height) { + if (dst_width != GetWidth() || + dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. 
+ if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + // Change our own data buffer pointers so we can pass them to the + // callback. + databuf_[i] += data_to_skip; + } + int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip; + (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy); + // Now change them back. + for (int i = 0; i < num_outbufs_; ++i) { + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + databuf_[i] -= data_to_skip; + } + lines_left -= scanlines_to_copy; + } + } + // Read full MCUs until we get to the crop point. + for (; lines_left >= GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow()); + } + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + (*fn)(opaque, databuf_, databuf_strides_, lines_left); + } + return FinishDecode(); +} + +void init_source(j_decompress_ptr cinfo) { + fill_input_buffer(cinfo); +} + +boolean fill_input_buffer(j_decompress_ptr cinfo) { + BufferVector* buf_vec = reinterpret_cast(cinfo->client_data); + if (buf_vec->pos >= buf_vec->len) { + assert(0 && "No more data"); + // ERROR: No more data + return FALSE; + } + cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data; + cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len; + ++buf_vec->pos; + return TRUE; +} + +void skip_input_data(j_decompress_ptr cinfo, + long num_bytes) { // NOLINT + cinfo->src->next_input_byte += num_bytes; +} + +void term_source(j_decompress_ptr cinfo) { + // Nothing to do. +} + +#ifdef HAVE_SETJMP +void ErrorHandler(j_common_ptr cinfo) { + // This is called when a jpeglib command experiences an error. Unfortunately + // jpeglib's error handling model is not very flexible, because it expects the + // error handler to not return--i.e., it wants the program to terminate. To + // recover from errors we use setjmp() as shown in their example. setjmp() is + // C's implementation for the "call with current continuation" functionality + // seen in some functional programming languages. + // A formatted message can be output, but is unsafe for release. +#ifdef DEBUG + char buf[JMSG_LENGTH_MAX]; + (*cinfo->err->format_message)(cinfo, buf); + // ERROR: Error in jpeglib: buf +#endif + + SetJmpErrorMgr* mgr = reinterpret_cast(cinfo->err); + // This rewinds the call stack to the point of the corresponding setjmp() + // and causes it to return (for a second time) with value 1. + longjmp(mgr->setjmp_buffer, 1); +} +#endif + +void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { + if (num_outbufs != num_outbufs_) { + // We could perhaps optimize this case to resize the output buffers without + // necessarily having to delete and recreate each one, but it's not worth + // it. 
+ DestroyOutputBuffers(); + + scanlines_ = new uint8** [num_outbufs]; + scanlines_sizes_ = new int[num_outbufs]; + databuf_ = new uint8* [num_outbufs]; + databuf_strides_ = new int[num_outbufs]; + + for (int i = 0; i < num_outbufs; ++i) { + scanlines_[i] = NULL; + scanlines_sizes_[i] = 0; + databuf_[i] = NULL; + databuf_strides_[i] = 0; + } + + num_outbufs_ = num_outbufs; + } +} + +void MJpegDecoder::DestroyOutputBuffers() { + for (int i = 0; i < num_outbufs_; ++i) { + delete [] scanlines_[i]; + delete [] databuf_[i]; + } + delete [] scanlines_; + delete [] databuf_; + delete [] scanlines_sizes_; + delete [] databuf_strides_; + scanlines_ = NULL; + databuf_ = NULL; + scanlines_sizes_ = NULL; + databuf_strides_ = NULL; + num_outbufs_ = 0; +} + +// JDCT_IFAST and do_block_smoothing improve performance substantially. +LIBYUV_BOOL MJpegDecoder::StartDecode() { + decompress_struct_->raw_data_out = TRUE; + decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default + decompress_struct_->dither_mode = JDITHER_NONE; + // Not applicable to 'raw': + decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE); + // Only for buffered mode: + decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE); + // Blocky but fast: + decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE); + + if (!jpeg_start_decompress(decompress_struct_)) { + // ERROR: Couldn't start JPEG decompressor"; + return LIBYUV_FALSE; + } + return LIBYUV_TRUE; +} + +LIBYUV_BOOL MJpegDecoder::FinishDecode() { + // jpeglib considers it an error if we finish without decoding the whole + // image, so we call "abort" rather than "finish". + jpeg_abort_decompress(decompress_struct_); + return LIBYUV_TRUE; +} + +void MJpegDecoder::SetScanlinePointers(uint8** data) { + for (int i = 0; i < num_outbufs_; ++i) { + uint8* data_i = data[i]; + for (int j = 0; j < scanlines_sizes_[i]; ++j) { + scanlines_[i][j] = data_i; + data_i += GetComponentStride(i); + } + } +} + +inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() { + return (unsigned int)(GetImageScanlinesPerImcuRow()) == + jpeg_read_raw_data(decompress_struct_, + scanlines_, + GetImageScanlinesPerImcuRow()); +} + +// The helper function which recognizes the jpeg sub-sampling type. +JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( + int* subsample_x, int* subsample_y, int number_of_components) { + if (number_of_components == 3) { // Color images. + if (subsample_x[0] == 1 && subsample_y[0] == 1 && + subsample_x[1] == 2 && subsample_y[1] == 2 && + subsample_x[2] == 2 && subsample_y[2] == 2) { + return kJpegYuv420; + } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && + subsample_x[1] == 2 && subsample_y[1] == 1 && + subsample_x[2] == 2 && subsample_y[2] == 1) { + return kJpegYuv422; + } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && + subsample_x[1] == 1 && subsample_y[1] == 1 && + subsample_x[2] == 1 && subsample_y[2] == 1) { + return kJpegYuv444; + } + } else if (number_of_components == 1) { // Grey-scale images. + if (subsample_x[0] == 1 && subsample_y[0] == 1) { + return kJpegYuv400; + } + } + return kJpegUnknown; +} + +} // namespace libyuv +#endif // HAVE_JPEG + diff --git a/TMessagesProj/jni/libyuv/source/mjpeg_validate.cc b/TMessagesProj/jni/libyuv/source/mjpeg_validate.cc new file mode 100644 index 000000000..23d22d099 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/mjpeg_validate.cc @@ -0,0 +1,47 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Helper function to validate the jpeg appears intact.
+// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+  size_t i;
+  if (sample_size < 64) {
+    // ERROR: Invalid jpeg size: sample_size
+    return LIBYUV_FALSE;
+  }
+  if (sample[0] != 0xff || sample[1] != 0xd8) {  // Start Of Image
+    // ERROR: Invalid jpeg initial start code
+    return LIBYUV_FALSE;
+  }
+  for (i = sample_size - 2; i > 1;) {
+    if (sample[i] != 0xd9) {
+      if (sample[i] == 0xff && sample[i + 1] == 0xd9) {  // End Of Image
+        return LIBYUV_TRUE;  // Success: Valid jpeg.
+      }
+      --i;
+    }
+    --i;
+  }
+  // ERROR: Invalid jpeg end code not found. Size sample_size
+  return LIBYUV_FALSE;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/TMessagesProj/jni/libyuv/source/planar_functions.cc b/TMessagesProj/jni/libyuv/source/planar_functions.cc
new file mode 100644
index 000000000..3857008ca
--- /dev/null
+++ b/TMessagesProj/jni/libyuv/source/planar_functions.cc
@@ -0,0 +1,2291 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/planar_functions.h"
+
+#include <string.h>  // for memset()
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int y;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Nothing to do.
+ if (src_y == dst_y && src_stride_y == dst_stride_y) { + return; + } +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Copy plane + for (y = 0; y < height; ++y) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +LIBYUV_API +void CopyPlane_16(const uint16* src_y, int src_stride_y, + uint16* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C; + // Coalesce rows. + if (src_stride_y == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_COPYROW_16_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_16_X86; + } +#endif +#if defined(HAS_COPYROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + CopyRow = CopyRow_16_SSE2; + } +#endif +#if defined(HAS_COPYROW_16_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_16_ERMS; + } +#endif +#if defined(HAS_COPYROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_16_NEON; + } +#endif +#if defined(HAS_COPYROW_16_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_16_MIPS; + } +#endif + + // Copy plane + for (y = 0; y < height; ++y) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Copy I422. +LIBYUV_API +int I422Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; +} + +// Copy I444. 
+LIBYUV_API +int I444Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; +} + +// Copy I400. +LIBYUV_API +int I400ToI400(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Convert I420 to I400. +LIBYUV_API +int I420ToI400(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Mirror a plane of data. +void MirrorPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } +#endif +#if defined(HAS_MIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSE2; + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + MirrorRow = MirrorRow_SSSE3; + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + MirrorRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert YUY2 to I422. 
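Before the converter that follows, a quick illustrative call sketch (not part of the upstream file): YUY2 packs each pixel pair as Y0 U Y1 V, so the packed source uses 2 bytes per pixel while the I422 output keeps a full-width luma plane and half-width chroma planes. The tightly packed strides and buffer names below are assumptions for the example.

#include "libyuv.h"

void Yuy2ToI422Sketch(const uint8* src_yuy2, uint8* dst_y,
                      uint8* dst_u, uint8* dst_v, int width, int height) {
  int half = (width + 1) / 2;              // 4:2:2 chroma plane width
  libyuv::YUY2ToI422(src_yuy2, width * 2,  // packed input, 2 bytes per pixel
                     dst_y, width,         // full-resolution luma
                     dst_u, half,          // half-width chroma
                     dst_v, half,
                     width, height);
}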
+LIBYUV_API +int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*YUY2ToUV422Row)(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) = + YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) = + YUY2ToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUV422Row = YUY2ToUV422Row_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width >= 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*UYVYToUV422Row)(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) = + UYVYToUV422Row_C; + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix) = UYVYToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. 
+ if (src_stride_uyvy == width * 2 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2; + UYVYToYRow = UYVYToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { + UYVYToUV422Row = UYVYToUV422Row_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + UYVYToYRow = UYVYToYRow_SSE2; + } + } + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + UYVYToUV422Row = UYVYToUV422Row_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUV422Row = UYVYToUV422Row_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (width >= 16) { + UYVYToUV422Row = UYVYToUV422Row_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUV422Row = UYVYToUV422Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + src_uyvy += src_stride_uyvy; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Mirror I400 with optional flipping +LIBYUV_API +int I400Mirror(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Mirror I420 with optional flipping +LIBYUV_API +int I420Mirror(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// ARGB mirror. 
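A small illustration of the ARGB mirror below combined with the negative-height convention documented throughout this file: mirroring horizontally while passing a negative height also flips vertically, which amounts to a 180-degree rotation. The buffer names and packed strides are assumptions; this sketch is not part of the upstream change.

#include "libyuv.h"

void Rotate180Sketch(const uint8* src_argb, uint8* dst_argb,
                     int width, int height) {
  // Horizontal mirror + vertical flip (negative height) = 180-degree rotation.
  libyuv::ARGBMirror(src_argb, width * 4,
                     dst_argb, width * 4,
                     width, -height);
}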
+LIBYUV_API +int ARGBMirror(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + ARGBMirrorRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + +#if defined(HAS_ARGBMIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBMirrorRow = ARGBMirrorRow_SSSE3; + } +#endif +#if defined(HAS_ARGBMIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_AVX2; + } +#endif +#if defined(HAS_ARGBMIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_NEON; + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + ARGBMirrorRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Get a blender that optimized for the CPU, alignment and pixel count. +// As there are 6 blenders to choose from, the caller should try to use +// the same blend function for all pixels if possible. +LIBYUV_API +ARGBBlendRow GetARGBBlend() { + void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width) = ARGBBlendRow_C; +#if defined(HAS_ARGBBLENDROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBBlendRow = ARGBBlendRow_SSSE3; + return ARGBBlendRow; + } +#endif +#if defined(HAS_ARGBBLENDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBBlendRow = ARGBBlendRow_SSE2; + } +#endif +#if defined(HAS_ARGBBLENDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBBlendRow = ARGBBlendRow_NEON; + } +#endif + return ARGBBlendRow; +} + +// Alpha Blend 2 ARGB images and store to destination. +LIBYUV_API +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width) = GetARGBBlend(); + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } + + for (y = 0; y < height; ++y) { + ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Multiply 2 ARGB images and store to destination. 
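A hedged call sketch for the per-channel multiply blend that follows; using it to darken an image with a per-pixel mask is only an assumed scenario, and the names below are illustrative rather than taken from the library.

#include "libyuv.h"

void ModulateSketch(const uint8* base_argb, const uint8* mask_argb,
                    uint8* out_argb, int width, int height) {
  libyuv::ARGBMultiply(base_argb, width * 4,  // first source image
                       mask_argb, width * 4,  // second source, e.g. a mask
                       out_argb, width * 4,   // channel-by-channel product
                       width, height);
}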
+LIBYUV_API +int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBMultiplyRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBMULTIPLYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_NEON; + } + } +#endif + + // Multiply plane + for (y = 0; y < height; ++y) { + ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Add 2 ARGB images and store to destination. +LIBYUV_API +int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBAddRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBAddRow = ARGBAddRow_SSE2; + } +#endif +#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBAddRow = ARGBAddRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBAddRow = ARGBAddRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + ARGBAddRow = ARGBAddRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBADDROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBAddRow = ARGBAddRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_NEON; + } + } +#endif + + // Add plane + for (y = 0; y < height; ++y) { + ARGBAddRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Subtract 2 ARGB images and store to destination. +LIBYUV_API +int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBSubtractRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSUBTRACTROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBSubtractRow = ARGBSubtractRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + ARGBSubtractRow = ARGBSubtractRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBSubtractRow = ARGBSubtractRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_NEON; + } + } +#endif + + // Subtract plane + for (y = 0; y < height; ++y) { + ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height) { + int y; + void (*I422ToBGRARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToBGRARow_C; + if (!src_y || !src_u || !src_v || + !dst_bgra || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; + dst_stride_bgra = -dst_stride_bgra; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_bgra == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0; + } +#if defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToBGRARow = I422ToBGRARow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToBGRARow = I422ToBGRARow_NEON; + } + } +#elif defined(HAS_I422TOBGRAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { + I422ToBGRARow = I422ToBGRARow_SSSE3; + } + } + } +#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) { + I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); + dst_bgra += dst_stride_bgra; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + int y; + void (*I422ToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToABGRRow_C; + if (!src_y || !src_u || !src_v || + !dst_abgr || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; + dst_stride_abgr = -dst_stride_abgr; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_abgr == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0; + } +#if defined(HAS_I422TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToABGRRow = I422ToABGRRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToABGRRow = I422ToABGRRow_NEON; + } + } +#elif defined(HAS_I422TOABGRROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToABGRRow = I422ToABGRRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { + I422ToABGRRow = I422ToABGRRow_SSSE3; + } + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); + dst_abgr += dst_stride_abgr; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to RGBA. 
+LIBYUV_API +int I422ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + int y; + void (*I422ToRGBARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || + !dst_rgba || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_rgba == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0; + } +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#elif defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*NV12ToRGB565Row)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToRGB565Row_C; + if (!src_y || !src_uv || !dst_rgb565 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_NV12TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; + } + } +#elif defined(HAS_NV12TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to RGB565. +LIBYUV_API +int NV21ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*NV21ToRGB565Row)(const uint8* y_buf, + const uint8* src_vu, + uint8* rgb_buf, + int width) = NV21ToRGB565Row_C; + if (!src_y || !src_vu || !dst_rgb565 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_NV21TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB565Row = NV21ToRGB565Row_SSSE3; + } + } +#elif defined(HAS_NV21TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB565Row = NV21ToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +LIBYUV_API +void SetPlane(uint8* dst_y, int dst_stride_y, + int width, int height, + uint32 value) { + int y; + uint32 v32 = value | (value << 8) | (value << 16) | (value << 24); + void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C; + // Coalesce rows. + if (dst_stride_y == width) { + width *= height; + height = 1; + dst_stride_y = 0; + } +#if defined(HAS_SETROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + SetRow = SetRow_NEON; + } +#endif +#if defined(HAS_SETROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + SetRow = SetRow_X86; + } +#endif + + // Set plane + for (y = 0; y < height; ++y) { + SetRow(dst_y, v32, width); + dst_y += dst_stride_y; + } +} + +// Draw a rectangle into I420 +LIBYUV_API +int I420Rect(uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int x, int y, + int width, int height, + int value_y, int value_u, int value_v) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + uint8* start_y = dst_y + y * dst_stride_y + x; + uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); + uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + if (!dst_y || !dst_u || !dst_v || + width <= 0 || height <= 0 || + x < 0 || y < 0 || + value_y < 0 || value_y > 255 || + value_u < 0 || value_u > 255 || + value_v < 0 || value_v > 255) { + return -1; + } + + SetPlane(start_y, dst_stride_y, width, height, value_y); + SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u); + SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v); + return 0; +} + +// Draw a rectangle into ARGB +LIBYUV_API +int ARGBRect(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, + int width, int height, + uint32 value) { + if (!dst_argb || + width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + dst_argb += dst_y * dst_stride_argb + dst_x * 4; + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_SETROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height); + return 0; + } +#endif +#if defined(HAS_SETROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height); + return 0; + } +#endif + ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height); + return 0; +} + +// Convert unattentuated ARGB to preattenuated ARGB. 
+// An unattenutated ARGB alpha blend uses the formula +// p = a * f + (1 - a) * b +// where +// p is output pixel +// f is foreground pixel +// b is background pixel +// a is alpha value from foreground pixel +// An preattenutated ARGB alpha blend uses the formula +// p = f + (1 - a) * b +// where +// f is foreground pixel premultiplied by alpha + +LIBYUV_API +int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBATTENUATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBAttenuateRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert preattentuated ARGB to unattenuated ARGB. +LIBYUV_API +int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBUnattenuateRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBUNATTENUATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBUNATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2; + } + } +#endif +// TODO(fbarchard): Neon version. 
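// Editorial worked example (not part of the upstream file), using the two
// formulas given above with a = 0.25, foreground f = 200, background b = 100:
//   unattenuated blend:   p = a*f + (1 - a)*b = 0.25*200 + 0.75*100 = 125
//   preattenuated blend:  f' = a*f = 50, so p = f' + (1 - a)*b = 50 + 75 = 125
// ARGBAttenuate() produces f' (each color channel scaled by alpha/255, up to
// rounding), and ARGBUnattenuate() recovers f wherever alpha is non-zero.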
+ + for (y = 0; y < height; ++y) { + ARGBUnattenuateRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB to Grayed ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBGrayRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#elif defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBGrayRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, + int width, int height) { + int y; + void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBGrayRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#elif defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif + for (y = 0; y < height; ++y) { + ARGBGrayRow(dst, dst, width); + dst += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, int width, int height) { + int y; + void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBSEPIAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBSepiaRow = ARGBSepiaRow_SSSE3; + } +#elif defined(HAS_ARGBSEPIAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_NEON; + } +#endif + for (y = 0; y < height; ++y) { + ARGBSepiaRow(dst, width); + dst += dst_stride_argb; + } + return 0; +} + +// Apply a 4x4 matrix to each ARGB pixel. +// Note: Normally for shading, but can be used to swizzle or invert. 
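+// Matrix coefficients are signed fixed point values where 64 represents 1.0
+// (see RGBColorMatrix below, which rescales a 7 bit 4x3 matrix to this form).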
+LIBYUV_API +int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const int8* matrix_argb, + int width, int height) { + int y; + void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) = ARGBColorMatrixRow_C; + if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; + } +#elif defined(HAS_ARGBCOLORMATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; + } +#endif + for (y = 0; y < height; ++y) { + ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Apply a 4x3 matrix to each ARGB pixel. +// Deprecated. +LIBYUV_API +int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_rgb, + int dst_x, int dst_y, int width, int height) { + SIMD_ALIGNED(int8 matrix_argb[16]); + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + + // Convert 4x3 7 bit matrix to 4x4 6 bit matrix. + matrix_argb[0] = matrix_rgb[0] / 2; + matrix_argb[1] = matrix_rgb[1] / 2; + matrix_argb[2] = matrix_rgb[2] / 2; + matrix_argb[3] = matrix_rgb[3] / 2; + matrix_argb[4] = matrix_rgb[4] / 2; + matrix_argb[5] = matrix_rgb[5] / 2; + matrix_argb[6] = matrix_rgb[6] / 2; + matrix_argb[7] = matrix_rgb[7] / 2; + matrix_argb[8] = matrix_rgb[8] / 2; + matrix_argb[9] = matrix_rgb[9] / 2; + matrix_argb[10] = matrix_rgb[10] / 2; + matrix_argb[11] = matrix_rgb[11] / 2; + matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; + matrix_argb[15] = 64; // 1.0 + + return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb, + dst, dst_stride_argb, + &matrix_argb[0], width, height); +} + +// Apply a color table each ARGB pixel. +// Table contains 256 ARGB values. +LIBYUV_API +int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int dst_x, int dst_y, int width, int height) { + int y; + void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + int width) = ARGBColorTableRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOLORTABLEROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + ARGBColorTableRow = ARGBColorTableRow_X86; + } +#endif + for (y = 0; y < height; ++y) { + ARGBColorTableRow(dst, table_argb, width); + dst += dst_stride_argb; + } + return 0; +} + +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. 
+LIBYUV_API +int RGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int dst_x, int dst_y, int width, int height) { + int y; + void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + int width) = RGBColorTableRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_RGBCOLORTABLEROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + RGBColorTableRow = RGBColorTableRow_X86; + } +#endif + for (y = 0; y < height; ++y) { + RGBColorTableRow(dst, table_argb, width); + dst += dst_stride_argb; + } + return 0; +} + +// ARGBQuantize is used to posterize art. +// e.g. rgb / qvalue * qvalue + qvalue / 2 +// But the low levels implement efficiently with 3 parameters, and could be +// used for other high level operations. +// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; +// where scale is 1 / interval_size as a fixed point value. +// The divide is replaces with a multiply by reciprocal fixed point multiply. +// Caveat - although SSE2 saturates, the C function does not and should be used +// with care if doing anything but quantization. +LIBYUV_API +int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, + int scale, int interval_size, int interval_offset, + int dst_x, int dst_y, int width, int height) { + int y; + void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) = ARGBQuantizeRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || + interval_size < 1 || interval_size > 255) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBQUANTIZEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBQuantizeRow = ARGBQuantizeRow_SSE2; + } +#elif defined(HAS_ARGBQUANTIZEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_NEON; + } +#endif + for (y = 0; y < height; ++y) { + ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); + dst += dst_stride_argb; + } + return 0; +} + +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height) { + int y; + void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; + int32* previous_cumsum = dst_cumsum; + if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + } +#endif + memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. + for (y = 0; y < height; ++y) { + ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); + previous_cumsum = dst_cumsum; + dst_cumsum += dst_stride32_cumsum; + src_argb += src_stride_argb; + } + return 0; +} + +// Blur ARGB image. 
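+// Box blur of the given radius, computed from the cumulative sum table
+// produced by ARGBComputeCumulativeSum above.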
+// Caller should allocate CumulativeSum table of width * height * 16 bytes +// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory +// as the buffer is treated as circular. +LIBYUV_API +int ARGBBlur(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height, int radius) { + int y; + void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum, + const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; + void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C; + int32* cumsum_bot_row; + int32* max_cumsum_bot_row; + int32* cumsum_top_row; + + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (radius > height) { + radius = height; + } + if (radius > (width / 2 - 1)) { + radius = width / 2 - 1; + } + if (radius <= 0) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; + } +#endif + // Compute enough CumulativeSum for first row to be blurred. After this + // one row of CumulativeSum is updated at a time. + ARGBComputeCumulativeSum(src_argb, src_stride_argb, + dst_cumsum, dst_stride32_cumsum, + width, radius); + + src_argb = src_argb + radius * src_stride_argb; + cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; + + max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum]; + cumsum_top_row = &dst_cumsum[0]; + + for (y = 0; y < height; ++y) { + int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0; + int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1); + int area = radius * (bot_y - top_y); + int boxwidth = radius * 4; + int x; + int n; + + // Increment cumsum_top_row pointer with circular buffer wrap around. + if (top_y) { + cumsum_top_row += dst_stride32_cumsum; + if (cumsum_top_row >= max_cumsum_bot_row) { + cumsum_top_row = dst_cumsum; + } + } + // Increment cumsum_bot_row pointer with circular buffer wrap around and + // then fill in a row of CumulativeSum. + if ((y + radius) < height) { + const int32* prev_cumsum_bot_row = cumsum_bot_row; + cumsum_bot_row += dst_stride32_cumsum; + if (cumsum_bot_row >= max_cumsum_bot_row) { + cumsum_bot_row = dst_cumsum; + } + ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row, + width); + src_argb += src_stride_argb; + } + + // Left clipped. + for (x = 0; x < radius + 1; ++x) { + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], 1); + area += (bot_y - top_y); + boxwidth += 4; + } + + // Middle unclipped. + n = (width - 1) - radius - x + 1; + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], n); + + // Right clipped. + for (x += n; x <= width - 1; ++x) { + area -= (bot_y - top_y); + boxwidth -= 4; + CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, + cumsum_bot_row + (x - radius - 1) * 4, + boxwidth, area, &dst_argb[x * 4], 1); + } + dst_argb += dst_stride_argb; + } + return 0; +} + +// Multiply ARGB image by a specified ARGB value. 
+LIBYUV_API +int ARGBShade(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, uint32 value) { + int y; + void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, + int width, uint32 value) = ARGBShadeRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHADEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBShadeRow = ARGBShadeRow_SSE2; + } +#elif defined(HAS_ARGBSHADEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBShadeRow = ARGBShadeRow_NEON; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShadeRow(src_argb, dst_argb, width, value); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Interpolate 2 ARGB images by specified amount (0 to 255). +LIBYUV_API +int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int interpolation) { + int y; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+  }
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+          IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        InterpolateRow = InterpolateRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+          IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        InterpolateRow = InterpolateRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 &&
+      IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
+      IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    InterpolateRow = InterpolateRow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
+                   width * 4, interpolation);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Shuffle ARGB channel order. e.g. BGRA to ARGB.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+                uint8* dst_argb, int dst_stride_argb,
+                const uint8* shuffler, int width, int height) {
+  int y;
+  void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
+                         const uint8* shuffler, int pix) = ARGBShuffleRow_C;
+  if (!src_bgra || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
+  }
+  // Coalesce rows.
+ if (src_stride_bgra == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_bgra = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHUFFLEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBShuffleRow = ARGBShuffleRow_SSSE3; + } + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBShuffleRow = ARGBShuffleRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + ARGBShuffleRow = ARGBShuffleRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); + src_bgra += src_stride_bgra; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Sobel ARGB effect. +static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + void (*SobelRow)(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst, int width)) { + int y; + void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) = ARGBToBayerGGRow_C; + void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) = SobelYRow_C; + void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobely, int width) = + SobelXRow_C; + const int kEdge = 16; // Extra pixels at start of row for extrude/align. + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // ARGBToBayer used to select G channel from ARGB. 
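+  // The selector 0x0d090501 used below picks byte 1 (the G channel) from each
+  // of 4 ARGB pixels.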
+#if defined(HAS_ARGBTOBAYERGGROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerGGRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOBAYERGGROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerGGRow_NEON; + } + } +#endif +#if defined(HAS_SOBELYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelYRow = SobelYRow_SSE2; + } +#endif +#if defined(HAS_SOBELYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelYRow = SobelYRow_NEON; + } +#endif +#if defined(HAS_SOBELXROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXRow = SobelXRow_SSE2; + } +#endif +#if defined(HAS_SOBELXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXRow = SobelXRow_NEON; + } +#endif + { + // 3 rows with edges before/after. + const int kRowSize = (width + kEdge + 15) & ~15; + align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); + uint8* row_sobelx = rows; + uint8* row_sobely = rows + kRowSize; + uint8* row_y = rows + kRowSize * 2; + + // Convert first row. + uint8* row_y0 = row_y + kEdge; + uint8* row_y1 = row_y0 + kRowSize; + uint8* row_y2 = row_y1 + kRowSize; + ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width); + row_y0[-1] = row_y0[0]; + memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. + ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width); + row_y1[-1] = row_y1[0]; + memset(row_y1 + width, row_y1[width - 1], 16); + memset(row_y2 + width, 0, 16); + + for (y = 0; y < height; ++y) { + // Convert next row of ARGB to Y. + if (y < (height - 1)) { + src_argb += src_stride_argb; + } + ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width); + row_y2[-1] = row_y2[0]; + row_y2[width] = row_y2[width - 1]; + + SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width); + SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width); + SobelRow(row_sobelx, row_sobely, dst_argb, width); + + // Cycle thru circular queue of 3 row_y buffers. + { + uint8* row_yt = row_y0; + row_y0 = row_y1; + row_y1 = row_y2; + row_y2 = row_yt; + } + + dst_argb += dst_stride_argb; + } + free_aligned_buffer_64(rows); + } + return 0; +} + +// Sobel ARGB effect. +LIBYUV_API +int ARGBSobel(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) = SobelRow_C; +#if defined(HAS_SOBELROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + SobelRow = SobelRow_SSE2; + } +#endif +#if defined(HAS_SOBELROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + SobelRow = SobelRow_NEON; + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelRow); +} + +// Sobel ARGB effect with planar output. 
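+// Writes the Sobel magnitude to a single 8 bit plane (dst_y) rather than
+// replicating it into packed ARGB.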
+LIBYUV_API +int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height) { + void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_, int width) = SobelToPlaneRow_C; +#if defined(HAS_SOBELTOPLANEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + SobelToPlaneRow = SobelToPlaneRow_SSE2; + } +#endif +#if defined(HAS_SOBELTOPLANEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_NEON; + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, + width, height, SobelToPlaneRow); +} + +// SobelXY ARGB effect. +// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. +LIBYUV_API +int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) = SobelXYRow_C; +#if defined(HAS_SOBELXYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + SobelXYRow = SobelXYRow_SSE2; + } +#endif +#if defined(HAS_SOBELXYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + SobelXYRow = SobelXYRow_NEON; + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelXYRow); +} + +// Apply a 4x4 polynomial to each ARGB pixel. +LIBYUV_API +int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const float* poly, + int width, int height) { + int y; + void (*ARGBPolynomialRow)(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) = ARGBPolynomialRow_C; + if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_SSE2; + } +#endif +#if defined(HAS_ARGBPOLYNOMIALROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) && + IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBPolynomialRow(src_argb, dst_argb, poly, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Apply a lumacolortable to each ARGB pixel. +LIBYUV_API +int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const uint8* luma, + int width, int height) { + int y; + void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb, + int width, const uint8* luma, const uint32 lumacoeff) = + ARGBLumaColorTableRow_C; + if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { + ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy Alpha from one ARGB image to another. +LIBYUV_API +int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) = + ARGBCopyAlphaRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && + IS_ALIGNED(width, 8)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; + } +#endif +#if defined(HAS_ARGBCOPYALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyAlphaRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy a planar Y channel to the alpha channel of a destination ARGB image. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) = + ARGBCopyYToAlphaRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && + IS_ALIGNED(width, 8)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; + } +#endif +#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyYToAlphaRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/rotate.cc b/TMessagesProj/jni/libyuv/source/rotate.cc new file mode 100644 index 000000000..fe0e72b13 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/rotate.cc @@ -0,0 +1,1315 @@ +/* + * Copyright 2011 The LibYuv Project Authors. 
All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" + +#include "libyuv/cpu_id.h" +#include "libyuv/convert.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#if defined(__APPLE__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".private_extern _" #name " \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" +#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" +#else +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +#name ": \n" +#endif +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_MIRRORROW_NEON +void MirrorRow_NEON(const uint8* src, uint8* dst, int width); +#define HAS_MIRRORROW_UV_NEON +void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width); +#define HAS_TRANSPOSE_WX8_NEON +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +#define HAS_TRANSPOSE_UVWX8_NEON +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width); +#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__aarch64__) || defined(LIBYUV_NEON)) +// #define HAS_MIRRORROW_NEON +// void MirrorRow_NEON(const uint8* src, uint8* dst, int width); +// #define HAS_MIRRORROW_UV_NEON +// void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width); +// #define HAS_TRANSPOSE_WX8_NEON +// void TransposeWx8_NEON(const uint8* src, int src_stride, +// uint8* dst, int dst_stride, int width); +// #define HAS_TRANSPOSE_UVWX8_NEON +// void TransposeUVWx8_NEON(const uint8* src, int src_stride, +// uint8* dst_a, int dst_stride_a, +// uint8* dst_b, int dst_stride_b, +// int width); +#endif // defined(__ARM_NEON__) + +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_TRANSPOSE_WX8_MIPS_DSPR2 +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); + +void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2 +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width); +#endif // defined(__mips__) + +#if !defined(LIBYUV_DISABLE_X86) && \ + defined(_M_IX86) && defined(_MSC_VER) +#define HAS_TRANSPOSE_WX8_SSSE3 +__declspec(naked) __declspec(align(16)) +static void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + __asm { + push edi + push esi + push ebp + mov eax, [esp + 12 + 4] // src + mov edi, [esp + 12 + 8] // src_stride + mov edx, [esp + 12 + 12] // dst + mov esi, [esp + 12 + 16] // dst_stride + mov ecx, [esp + 12 + 
20] // width + + // Read in the data from the source pointer. + // First round of bit swap. + align 4 + convertloop: + movq xmm0, qword ptr [eax] + lea ebp, [eax + 8] + movq xmm1, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm0, xmm1 + movq xmm2, qword ptr [eax] + movdqa xmm1, xmm0 + palignr xmm1, xmm1, 8 + movq xmm3, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm2, xmm3 + movdqa xmm3, xmm2 + movq xmm4, qword ptr [eax] + palignr xmm3, xmm3, 8 + movq xmm5, qword ptr [eax + edi] + punpcklbw xmm4, xmm5 + lea eax, [eax + 2 * edi] + movdqa xmm5, xmm4 + movq xmm6, qword ptr [eax] + palignr xmm5, xmm5, 8 + movq xmm7, qword ptr [eax + edi] + punpcklbw xmm6, xmm7 + mov eax, ebp + movdqa xmm7, xmm6 + palignr xmm7, xmm7, 8 + // Second round of bit swap. + punpcklwd xmm0, xmm2 + punpcklwd xmm1, xmm3 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + palignr xmm2, xmm2, 8 + palignr xmm3, xmm3, 8 + punpcklwd xmm4, xmm6 + punpcklwd xmm5, xmm7 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + palignr xmm6, xmm6, 8 + palignr xmm7, xmm7, 8 + // Third round of bit swap. + // Write to the destination pointer. + punpckldq xmm0, xmm4 + movq qword ptr [edx], xmm0 + movdqa xmm4, xmm0 + palignr xmm4, xmm4, 8 + movq qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + punpckldq xmm2, xmm6 + movdqa xmm6, xmm2 + palignr xmm6, xmm6, 8 + movq qword ptr [edx], xmm2 + punpckldq xmm1, xmm5 + movq qword ptr [edx + esi], xmm6 + lea edx, [edx + 2 * esi] + movdqa xmm5, xmm1 + movq qword ptr [edx], xmm1 + palignr xmm5, xmm5, 8 + punpckldq xmm3, xmm7 + movq qword ptr [edx + esi], xmm5 + lea edx, [edx + 2 * esi] + movq qword ptr [edx], xmm3 + movdqa xmm7, xmm3 + palignr xmm7, xmm7, 8 + sub ecx, 8 + movq qword ptr [edx + esi], xmm7 + lea edx, [edx + 2 * esi] + jg convertloop + + pop ebp + pop esi + pop edi + ret + } +} + +#define HAS_TRANSPOSE_UVWX8_SSE2 +__declspec(naked) __declspec(align(16)) +static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w) { + __asm { + push ebx + push esi + push edi + push ebp + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride + mov edx, [esp + 16 + 12] // dst_a + mov esi, [esp + 16 + 16] // dst_stride_a + mov ebx, [esp + 16 + 20] // dst_b + mov ebp, [esp + 16 + 24] // dst_stride_b + mov ecx, esp + sub esp, 4 + 16 + and esp, ~15 + mov [esp + 16], ecx + mov ecx, [ecx + 16 + 28] // w + + align 4 + convertloop: + // Read in the data from the source pointer. + // First round of bit swap. + movdqa xmm0, [eax] + movdqa xmm1, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm0 // use xmm7 as temp register. + punpcklbw xmm0, xmm1 + punpckhbw xmm7, xmm1 + movdqa xmm1, xmm7 + movdqa xmm2, [eax] + movdqa xmm3, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm2 + punpcklbw xmm2, xmm3 + punpckhbw xmm7, xmm3 + movdqa xmm3, xmm7 + movdqa xmm4, [eax] + movdqa xmm5, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm4 + punpcklbw xmm4, xmm5 + punpckhbw xmm7, xmm5 + movdqa xmm5, xmm7 + movdqa xmm6, [eax] + movdqa xmm7, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa [esp], xmm5 // backup xmm5 + neg edi + movdqa xmm5, xmm6 // use xmm5 as temp register. + punpcklbw xmm6, xmm7 + punpckhbw xmm5, xmm7 + movdqa xmm7, xmm5 + lea eax, [eax + 8 * edi + 16] + neg edi + // Second round of bit swap. 
+ movdqa xmm5, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm5, xmm2 + movdqa xmm2, xmm5 + movdqa xmm5, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm5, xmm3 + movdqa xmm3, xmm5 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm6 + punpckhwd xmm5, xmm6 + movdqa xmm6, xmm5 + movdqa xmm5, [esp] // restore xmm5 + movdqa [esp], xmm6 // backup xmm6 + movdqa xmm6, xmm5 // use xmm6 as temp register. + punpcklwd xmm5, xmm7 + punpckhwd xmm6, xmm7 + movdqa xmm7, xmm6 + // Third round of bit swap. + // Write to the destination pointer. + movdqa xmm6, xmm0 + punpckldq xmm0, xmm4 + punpckhdq xmm6, xmm4 + movdqa xmm4, xmm6 + movdqa xmm6, [esp] // restore xmm6 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [ebx], xmm0 + movlpd qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm4 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm2 // use xmm0 as the temp register. + punpckldq xmm2, xmm6 + movlpd qword ptr [edx], xmm2 + movhpd qword ptr [ebx], xmm2 + punpckhdq xmm0, xmm6 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm1 // use xmm0 as the temp register. + punpckldq xmm1, xmm5 + movlpd qword ptr [edx], xmm1 + movhpd qword ptr [ebx], xmm1 + punpckhdq xmm0, xmm5 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm3 // use xmm0 as the temp register. + punpckldq xmm3, xmm7 + movlpd qword ptr [edx], xmm3 + movhpd qword ptr [ebx], xmm3 + punpckhdq xmm0, xmm7 + sub ecx, 8 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + jg convertloop + + mov esp, [esp + 16] + pop ebp + pop edi + pop esi + pop ebx + ret + } +} +#elif !defined(LIBYUV_DISABLE_X86) && \ + (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) +#define HAS_TRANSPOSE_WX8_SSSE3 +static void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc" + #if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + #endif + ); +} + +#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) +#define HAS_TRANSPOSE_UVWX8_SSE2 +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w); + asm ( + DECLARE_FUNCTION(TransposeUVWx8_SSE2) + "push %ebx \n" + "push %esi \n" + "push %edi \n" + "push %ebp \n" + "mov 0x14(%esp),%eax \n" + "mov 0x18(%esp),%edi \n" + "mov 0x1c(%esp),%edx \n" + "mov 0x20(%esp),%esi \n" + "mov 0x24(%esp),%ebx \n" + "mov 0x28(%esp),%ebp \n" + "mov %esp,%ecx \n" + "sub $0x14,%esp \n" + "and $0xfffffff0,%esp \n" + "mov %ecx,0x10(%esp) \n" + "mov 0x2c(%ecx),%ecx \n" + +"1: \n" + "movdqa (%eax),%xmm0 \n" + "movdqa (%eax,%edi,1),%xmm1 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm0,%xmm7 \n" + "punpcklbw %xmm1,%xmm0 \n" + "punpckhbw %xmm1,%xmm7 \n" + "movdqa %xmm7,%xmm1 \n" + "movdqa (%eax),%xmm2 \n" + "movdqa (%eax,%edi,1),%xmm3 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm2,%xmm7 \n" + "punpcklbw %xmm3,%xmm2 \n" + "punpckhbw %xmm3,%xmm7 \n" + "movdqa %xmm7,%xmm3 \n" + "movdqa (%eax),%xmm4 \n" + "movdqa (%eax,%edi,1),%xmm5 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm4,%xmm7 \n" + "punpcklbw %xmm5,%xmm4 \n" + "punpckhbw %xmm5,%xmm7 \n" + "movdqa %xmm7,%xmm5 \n" + "movdqa (%eax),%xmm6 \n" + "movdqa (%eax,%edi,1),%xmm7 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm5,(%esp) \n" + "neg %edi \n" + "movdqa %xmm6,%xmm5 \n" + "punpcklbw %xmm7,%xmm6 \n" + "punpckhbw %xmm7,%xmm5 \n" + "movdqa %xmm5,%xmm7 \n" + "lea 0x10(%eax,%edi,8),%eax \n" + "neg %edi \n" + "movdqa %xmm0,%xmm5 \n" + "punpcklwd %xmm2,%xmm0 \n" + "punpckhwd %xmm2,%xmm5 \n" + "movdqa %xmm5,%xmm2 \n" + "movdqa %xmm1,%xmm5 \n" + "punpcklwd %xmm3,%xmm1 \n" + "punpckhwd %xmm3,%xmm5 \n" + "movdqa %xmm5,%xmm3 \n" + "movdqa %xmm4,%xmm5 \n" + "punpcklwd %xmm6,%xmm4 \n" + "punpckhwd %xmm6,%xmm5 \n" + "movdqa %xmm5,%xmm6 \n" + "movdqa (%esp),%xmm5 \n" + "movdqa %xmm6,(%esp) \n" + "movdqa %xmm5,%xmm6 \n" + "punpcklwd %xmm7,%xmm5 \n" + "punpckhwd %xmm7,%xmm6 \n" + "movdqa %xmm6,%xmm7 \n" + "movdqa %xmm0,%xmm6 \n" + "punpckldq %xmm4,%xmm0 \n" + "punpckhdq %xmm4,%xmm6 \n" + "movdqa %xmm6,%xmm4 \n" + "movdqa (%esp),%xmm6 \n" + "movlpd %xmm0,(%edx) \n" + "movhpd %xmm0,(%ebx) \n" + "movlpd %xmm4,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm4,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm2,%xmm0 \n" + "punpckldq %xmm6,%xmm2 \n" + "movlpd %xmm2,(%edx) \n" + "movhpd %xmm2,(%ebx) \n" + "punpckhdq %xmm6,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" 
+ "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm1,%xmm0 \n" + "punpckldq %xmm5,%xmm1 \n" + "movlpd %xmm1,(%edx) \n" + "movhpd %xmm1,(%ebx) \n" + "punpckhdq %xmm5,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm3,%xmm0 \n" + "punpckldq %xmm7,%xmm3 \n" + "movlpd %xmm3,(%edx) \n" + "movhpd %xmm3,(%ebx) \n" + "punpckhdq %xmm7,%xmm0 \n" + "sub $0x8,%ecx \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "jg 1b \n" + "mov 0x10(%esp),%esp \n" + "pop %ebp \n" + "pop %edi \n" + "pop %esi \n" + "pop %ebx \n" +#if defined(__native_client__) + "pop %ecx \n" + "and $0xffffffe0,%ecx \n" + "jmp *%ecx \n" +#else + "ret \n" +#endif +); +#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ + defined(__x86_64__) +// 64 bit version has enough registers to do 16x8 to 8x16 at a time. +#define HAS_TRANSPOSE_WX8_FAST_SSSE3 +static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqa (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqa (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqa (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqa (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqa (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" +); +} + +#define HAS_TRANSPOSE_UVWX8_SSE2 +static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqa (%0),%%xmm2 \n" + "movdqa (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqa (%0),%%xmm4 \n" + "movdqa (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqa (%0),%%xmm6 \n" + "movdqa (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(w) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9" +); +} +#endif +#endif + +static void TransposeWx8_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +static void TransposeWxH_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +LIBYUV_API +void TransposePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i = height; + void (*TransposeWx8)(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) = TransposeWx8_C; +#if defined(HAS_TRANSPOSE_WX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeWx8 = TransposeWx8_NEON; + } +#endif +#if defined(HAS_TRANSPOSE_WX8_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + TransposeWx8 = TransposeWx8_SSSE3; + } +#endif +#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + TransposeWx8 = TransposeWx8_FAST_SSSE3; + } +#endif +#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { + if (IS_ALIGNED(width, 4) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2; + } else { + TransposeWx8 = TransposeWx8_MIPS_DSPR2; + } + } +#endif + + // Work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8(src, src_stride, dst, dst_stride, width); + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. 
+ i -= 8; + } + + TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); +} + +LIBYUV_API +void RotatePlane90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. + dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Swap first and last row and mirror the content. Uses a temporary row. + align_buffer_64(row, width); + const uint8* src_bot = src + src_stride * (height - 1); + uint8* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } +#endif +#if defined(HAS_MIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + MirrorRow = MirrorRow_SSE2; + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + MirrorRow = MirrorRow_SSSE3; + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } +#endif +#if defined(HAS_MIRRORROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { + MirrorRow = MirrorRow_MIPS_DSPR2; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. 
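+  // Work from both ends toward the middle, swapping and mirroring one row
+  // pair per iteration.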
+ for (y = 0; y < half_height; ++y) { + MirrorRow(src, row, width); // Mirror first row into a buffer + src += src_stride; + MirrorRow(src_bot, dst, width); // Mirror last row into first row + dst += dst_stride; + CopyRow(row, dst_bot, width); // Copy first mirrored row into last + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64(row); +} + +static void TransposeUVWx8_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +static void TransposeUVWxH_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i; + for (i = 0; i < width * 2; i += 2) { + int j; + for (j = 0; j < height; ++j) { + dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; + dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; + } + } +} + +LIBYUV_API +void TransposeUV(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i = height; + void (*TransposeUVWx8)(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) = TransposeUVWx8_C; +#if defined(HAS_TRANSPOSE_UVWX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeUVWx8 = TransposeUVWx8_NEON; + } +#elif defined(HAS_TRANSPOSE_UVWX8_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 8) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + TransposeUVWx8 = TransposeUVWx8_SSE2; + } +#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; + } +#endif + + // Work through the source in 8x8 tiles. + while (i >= 8) { + TransposeUVWx8(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width); + src += 8 * src_stride; // Go down 8 rows. + dst_a += 8; // Move over 8 columns. + dst_b += 8; // Move over 8 columns. 
+ i -= 8; + } + + TransposeUVWxH_C(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, i); +} + +LIBYUV_API +void RotateUV90(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + src += src_stride * (height - 1); + src_stride = -src_stride; + + TransposeUV(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, height); +} + +LIBYUV_API +void RotateUV270(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + dst_a += dst_stride_a * (width - 1); + dst_b += dst_stride_b * (width - 1); + dst_stride_a = -dst_stride_a; + dst_stride_b = -dst_stride_b; + + TransposeUV(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, height); +} + +// Rotate 180 is a horizontal and vertical flip. +LIBYUV_API +void RotateUV180(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i; + void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = + MirrorUVRow_C; +#if defined(HAS_MIRRORUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + MirrorRowUV = MirrorUVRow_NEON; + } +#elif defined(HAS_MIRRORROW_UV_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + MirrorRowUV = MirrorUVRow_SSSE3; + } +#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + MirrorRowUV = MirrorUVRow_MIPS_DSPR2; + } +#endif + + dst_a += dst_stride_a * (height - 1); + dst_b += dst_stride_b * (height - 1); + + for (i = 0; i < height; ++i) { + MirrorRowUV(src, dst_a, dst_b, width); + src += src_stride; + dst_a -= dst_stride_a; + dst_b -= dst_stride_b; + } +} + +LIBYUV_API +int RotatePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height, + enum RotationMode mode) { + if (!src || width <= 0 || height == 0 || !dst) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate90: + RotatePlane90(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate270: + RotatePlane270(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate180: + RotatePlane180(src, src_stride, + dst, dst_stride, + width, height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int I420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. 
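+  // Inversion is done by pointing each source plane at its last row and
+  // negating its stride, so the planes are read bottom-up.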
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I420Copy(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane90(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane90(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane270(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane270(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane180(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane180(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_uv || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + + switch (mode) { + case kRotate0: + // copy frame + return NV12ToI420(src_y, src_stride_y, + src_uv, src_stride_uv, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV90(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV270(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV180(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/rotate_argb.cc b/TMessagesProj/jni/libyuv/source/rotate_argb.cc new file mode 100644 index 000000000..ab0f9ce07 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/rotate_argb.cc @@ -0,0 +1,209 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" + +#include "libyuv/cpu_id.h" +#include "libyuv/convert.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// ARGBScale has a function to copy pixels to a row, striding each source +// pixel by a constant. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || \ + (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__)) +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride, + int src_stepx, + uint8* dst_ptr, int dst_width); +#endif +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_SCALEARGBROWDOWNEVEN_NEON +void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride, + int src_stepx, + uint8* dst_ptr, int dst_width); +#endif + +void ScaleARGBRowDownEven_C(const uint8* src_ptr, int, + int src_stepx, + uint8* dst_ptr, int dst_width); + +static void ARGBTranspose(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i; + int src_pixel_step = src_stride >> 2; + void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride, + int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C; +#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) && // Width of dest. + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; + } +#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) && // Width of dest. + IS_ALIGNED(src, 4)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; + } +#endif + + for (i = 0; i < width; ++i) { // column of source to row of dest. + ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height); + dst += dst_stride; + src += 4; + } +} + +void ARGBRotate90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 90 is a ARGBTranspose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + ARGBTranspose(src, src_stride, dst, dst_stride, width, height); +} + +void ARGBRotate270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 270 is a ARGBTranspose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. + dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + ARGBTranspose(src, src_stride, dst, dst_stride, width, height); +} + +void ARGBRotate180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Swap first and last row and mirror the content. Uses a temporary row. 
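+  // The temporary row holds one row of ARGB pixels, 4 bytes per pixel.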
+ align_buffer_64(row, width * 4); + const uint8* src_bot = src + src_stride * (height - 1); + uint8* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + ARGBMirrorRow_C; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_ARGBMIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + ARGBMirrorRow = ARGBMirrorRow_SSSE3; + } +#endif +#if defined(HAS_ARGBMIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_AVX2; + } +#endif +#if defined(HAS_ARGBMIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_NEON; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + ARGBMirrorRow(src, row, width); // Mirror first row into a buffer + ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row + CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last + src += src_stride; + dst += dst_stride; + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64(row); +} + +LIBYUV_API +int ARGBRotate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + enum RotationMode mode) { + if (!src_argb || width <= 0 || height == 0 || !dst_argb) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + switch (mode) { + case kRotate0: + // copy frame + return ARGBCopy(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + case kRotate90: + ARGBRotate90(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + case kRotate270: + ARGBRotate270(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + case kRotate180: + ARGBRotate180(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/rotate_mips.cc b/TMessagesProj/jni/libyuv/source/rotate_mips.cc new file mode 100644 index 000000000..70770fd06 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/rotate_mips.cc @@ -0,0 +1,485 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_MIPS) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) + +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "sw $s0, 0(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "sw $s1, 4(%[dst]) \n" + "bnez %[width], 1b \n" + " addu %[dst], %[dst], %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "swr $s0, 0(%[dst]) \n" + "swl $s0, 3(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "swr $s1, 4(%[dst]) \n" + "swl $s1, 7(%[dst]) \n" + "bnez %[width], 11b \n" + "addu %[dst], %[dst], %[dst_stride] \n" + "2: \n" + ".set pop \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1" + ); +} + +void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + __asm__ __volatile__ ( + ".set noat \n" + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + + "srl $AT, %[width], 0x2 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, 
$t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, %[dst_stride] \n" + + "sw $s4, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $s6, 0($s0) \n" + "sw $t8, 4($s0) \n" + "sw $s5, 0($s1) \n" + "sw $t1, 4($s1) \n" + "sw $s7, 0($s2) \n" + "sw $t9, 4($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 1b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, $t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, 
%[dst_stride] \n" + + "swr $s4, 0(%[dst]) \n" + "swl $s4, 3(%[dst]) \n" + "swr $t0, 4(%[dst]) \n" + "swl $t0, 7(%[dst]) \n" + "swr $s6, 0($s0) \n" + "swl $s6, 3($s0) \n" + "swr $t8, 4($s0) \n" + "swl $t8, 7($s0) \n" + "swr $s5, 0($s1) \n" + "swl $s5, 3($s1) \n" + "swr $t1, 4($s1) \n" + "swl $t1, 7($s1) \n" + "swr $s7, 0($s2) \n" + "swl $s7, 3($s2) \n" + "swr $t9, 4($s2) \n" + "swl $t9, 7($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 11b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "2: \n" + ".set pop \n" + ".set at \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7" + ); +} + +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "subu $t7, $t9, %[src_stride] \n" + "srl $t1, %[width], 1 \n" + +// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b + "andi $t0, %[dst_a], 0x3 \n" + "andi $t8, %[dst_b], 0x3 \n" + "or $t0, $t0, $t8 \n" + "andi $t8, %[dst_stride_a], 0x3 \n" + "andi $s5, %[dst_stride_b], 0x3 \n" + "or $t8, $t8, $s5 \n" + "or $t0, $t0, $t8 \n" + "bnez $t0, 11f \n" + " nop \n" +// dst + dst_stride word aligned (both, a & b dst addresses) + "1: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + "addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "sw $s3, 0($s5) \n" + "sw $s4, 0($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "sw $s3, 0(%[dst_a]) \n" + "sw $s4, 0(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + "sw $s3, 4($s5) \n" + "sw $s4, 4($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "sw $s3, 4(%[dst_a]) \n" + "sw $s4, 4(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 1b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + "b 2f \n" + " nop \n" + +// dst_a 
or dst_b or dst_stride_a or dst_stride_b not word aligned + "11: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + "addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "swr $s3, 0($s5) \n" + "swl $s3, 3($s5) \n" + "swr $s4, 0($s6) \n" + "swl $s4, 3($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "swr $s3, 0(%[dst_a]) \n" + "swl $s3, 3(%[dst_a]) \n" + "swr $s4, 0(%[dst_b]) \n" + "swl $s4, 3(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + + "swr $s3, 4($s5) \n" + "swl $s3, 7($s5) \n" + "swr $s4, 4($s6) \n" + "swl $s4, 7($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "swr $s3, 4(%[dst_a]) \n" + "swl $s3, 7(%[dst_a]) \n" + "swr $s4, 4(%[dst_b]) \n" + "swl $s4, 7(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 11b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + + "2: \n" + ".set pop \n" + : [src] "+r" (src), + [dst_a] "+r" (dst_a), + [dst_b] "+r" (dst_b), + [width] "+r" (width), + [src_stride] "+r" (src_stride) + : [dst_stride_a] "r" (dst_stride_a), + [dst_stride_b] "r" (dst_stride_b) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/rotate_neon.cc b/TMessagesProj/jni/libyuv/source/rotate_neon.cc new file mode 100644 index 000000000..d354e11fa --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/rotate_neon.cc @@ -0,0 +1,533 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) + +static uvec8 kVTbl4x4Transpose = + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + const uint8* src_temp = NULL; + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %5, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "vld1.8 {d0}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d1}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d3}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d4}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d5}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d6}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d7}, [%0] \n" + + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" + + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.8 {d1}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d3}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d5}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d7}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d6}, [%0] \n" + + "add %1, #8 \n" // src += 8 + "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride + "subs %5, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %5, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %5, #2 \n" + "blt 3f \n" + + "cmp %5, #4 \n" + "blt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.32 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d2[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d2[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d3[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d3[1]}, [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(6) + "vld1.8 {q3}, [%6] \n" + + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
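+    // Each destination row receives two 4-byte halves: bytes gathered from
+    // source rows 0-3 at offset 0, then bytes from rows 4-7 at offset 4.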
+ MEMACCESS(0) + "vst1.32 {d4[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d4[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d5[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d5[1]}, [%0] \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "vst1.32 {d0[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d0[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d1[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d1[1]}, [%0] \n" + + "add %1, #4 \n" // src += 4 + "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride + "subs %5, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %5, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.16 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[3]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.64 {d1}, [%0] \n" + + "add %1, #2 \n" // src += 2 + "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride + "subs %5, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "vld1.8 {d0[0]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[1]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[2]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[3]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[4]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[5]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[6]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[7]}, [%1] \n" + + MEMACCESS(3) + "vst1.64 {d0}, [%3] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst), // %3 + "+r"(dst_stride), // %4 + "+r"(width) // %5 + : "r"(&kVTbl4x4Transpose) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3" + ); +} + +static uvec8 kVTbl4x4TransposeDi = + { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; + +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + const uint8* src_temp = NULL; + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %7, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "vld2.8 {d0, d1}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d2, d3}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d4, d5}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d6, d7}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d16, d17}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d18, d19}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d20, d21}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d22, d23}, [%0] \n" + + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" + + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d6}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d18}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d16}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d22}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d20}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.8 {d3}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d1}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d7}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d5}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d19}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d17}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d23}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d21}, [%0] \n" + + "add %1, #8*2 \n" // src += 8*2 + "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %7, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %7, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %7, #2 \n" + "blt 3f \n" + + "cmp %7, #4 \n" + "blt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.64 {d0}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d1}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d2}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d3}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d4}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d5}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d6}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d7}, [%0] \n" + + MEMACCESS(8) + "vld1.8 {q15}, [%8] \n" + + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" + + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.32 {d16[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d16[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d17[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d17[1]}, [%0], %4 \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "vst1.32 {d20[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d20[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d21[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d21[1]}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.32 {d18[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d18[1]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d19[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d19[1]}, [%0], %6 \n" + + "add %0, %5, #4 \n" + MEMACCESS(0) + "vst1.32 {d22[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d22[1]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d23[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d23[1]}, [%0] \n" + + "add %1, #4*2 \n" // src += 4 * 2 + "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a + "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b + "subs %7, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %7, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[3], d3[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.64 {d2}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.64 {d1}, [%0], %6 \n" + MEMACCESS(0) + "vst1.64 {d3}, [%0] \n" + + "add %1, #2*2 \n" // src += 2 * 2 + "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a + "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b + "subs %7, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[7], d1[7]}, [%1] \n" + + MEMACCESS(3) + "vst1.64 {d0}, [%3] 
\n" + MEMACCESS(5) + "vst1.64 {d1}, [%5] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst_a), // %3 + "+r"(dst_stride_a), // %4 + "+r"(dst_b), // %5 + "+r"(dst_stride_b), // %6 + "+r"(width) // %7 + : "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", + "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/rotate_neon64.cc b/TMessagesProj/jni/libyuv/source/rotate_neon64.cc new file mode 100644 index 000000000..b080a2c6a --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/rotate_neon64.cc @@ -0,0 +1,540 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +//this ifdef should be removed if TransposeWx8_NEON's aarch64 has +//been done +#ifdef HAS_TRANSPOSE_WX8_NEON +static uvec8 kVTbl4x4Transpose = + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + const uint8* src_temp = NULL; + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %5, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "vld1.8 {d0}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d1}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d3}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d4}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d5}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d6}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d7}, [%0] \n" + + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" + + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.8 {d1}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d3}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d5}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d7}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d6}, [%0] \n" + + "add %1, #8 \n" // src += 8 + "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride + "subs %5, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %5, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %5, #2 \n" + "blt 3f \n" + + "cmp %5, #4 \n" + "blt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.32 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d2[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d2[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d3[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d3[1]}, [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(6) + "vld1.8 {q3}, [%6] \n" + + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. + MEMACCESS(0) + "vst1.32 {d4[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d4[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d5[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d5[1]}, [%0] \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "vst1.32 {d0[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d0[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d1[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d1[1]}, [%0] \n" + + "add %1, #4 \n" // src += 4 + "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride + "subs %5, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %5, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.16 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[3]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.64 {d1}, [%0] \n" + + "add %1, #2 \n" // src += 2 + "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride + "subs %5, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "vld1.8 {d0[0]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[1]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[2]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[3]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[4]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[5]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[6]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[7]}, [%1] \n" + + MEMACCESS(3) + "vst1.64 {d0}, [%3] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst), // %3 + "+r"(dst_stride), // %4 + "+r"(width) // %5 + : "r"(&kVTbl4x4Transpose) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3" + ); +} +#endif //HAS_TRANSPOSE_WX8_NEON + +//this ifdef should be removed if TransposeUVWx8_NEON's aarch64 has +//been done +#ifdef HAS_TRANSPOSE_UVWX8_NEON +static uvec8 kVTbl4x4TransposeDi = + { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; + +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + const uint8* src_temp = NULL; + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %7, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "vld2.8 {d0, d1}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d2, d3}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d4, d5}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d6, d7}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d16, d17}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d18, d19}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d20, d21}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d22, d23}, [%0] \n" + + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" + + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d6}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d18}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d16}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d22}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d20}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.8 {d3}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d1}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d7}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d5}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d19}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d17}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d23}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d21}, [%0] \n" + + "add %1, #8*2 \n" // src += 8*2 + "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %7, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %7, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %7, #2 \n" + "blt 3f \n" + + "cmp %7, #4 \n" + "blt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.64 {d0}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d1}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d2}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d3}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d4}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d5}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d6}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d7}, [%0] \n" + + MEMACCESS(8) + "vld1.8 {q15}, [%8] \n" + + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" + + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.32 {d16[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d16[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d17[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d17[1]}, [%0], %4 \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "vst1.32 {d20[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d20[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d21[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d21[1]}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.32 {d18[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d18[1]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d19[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d19[1]}, [%0], %6 \n" + + "add %0, %5, #4 \n" + MEMACCESS(0) + "vst1.32 {d22[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d22[1]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d23[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d23[1]}, [%0] \n" + + "add %1, #4*2 \n" // src += 4 * 2 + "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a + "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b + "subs %7, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %7, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[3], d3[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.64 {d2}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.64 {d1}, [%0], %6 \n" + MEMACCESS(0) + "vst1.64 {d3}, [%0] \n" + + "add %1, #2*2 \n" // src += 2 * 2 + "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a + "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b + "subs %7, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[7], d1[7]}, [%1] \n" + + MEMACCESS(3) + "vst1.64 {d0}, [%3] 
\n" + MEMACCESS(5) + "vst1.64 {d1}, [%5] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst_a), // %3 + "+r"(dst_stride_a), // %4 + "+r"(dst_b), // %5 + "+r"(dst_stride_b), // %6 + "+r"(width) // %7 + : "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", + "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} +#endif // HAS_TRANSPOSE_UVWX8_NEON +#endif // __aarch64__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/row_any.cc b/TMessagesProj/jni/libyuv/source/row_any.cc new file mode 100644 index 000000000..aaa0378d7 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/row_any.cc @@ -0,0 +1,602 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels. +// TODO(fbarchard): Consider 'any' functions handling odd alignment. +// YUV to RGB does multiple of 8 with SIMD and remainder with C. +#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK) \ + void NAMEANY(const uint8* y_buf, \ + const uint8* u_buf, \ + const uint8* v_buf, \ + uint8* rgb_buf, \ + int width) { \ + int n = width & ~MASK; \ + I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \ + I420TORGB_C(y_buf + n, \ + u_buf + (n >> UV_SHIFT), \ + v_buf + (n >> UV_SHIFT), \ + rgb_buf + n * BPP, width & MASK); \ + } + +#ifdef HAS_I422TOARGBROW_SSSE3 +YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, + 1, 4, 7) +#endif // HAS_I422TOARGBROW_SSSE3 +#ifdef HAS_I444TOARGBROW_SSSE3 +YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, + 0, 4, 7) +YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, + 2, 4, 7) +YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, + 1, 4, 7) +YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, + 1, 4, 7) +YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, + 1, 4, 7) +// I422ToRGB565Row_SSSE3 is unaligned. +YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C, + 1, 2, 7) +YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C, + 1, 2, 7) +YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C, + 1, 2, 7) +// I422ToRGB24Row_SSSE3 is unaligned. 
+YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7) +YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7) +YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15) +YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15) +#endif // HAS_I444TOARGBROW_SSSE3 +#ifdef HAS_I422TOARGBROW_AVX2 +YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15) +#endif // HAS_I422TOARGBROW_AVX2 +#ifdef HAS_I422TOARGBROW_NEON +YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7) +YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7) +YANY(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, I411ToARGBRow_C, 2, 4, 7) +YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1, 4, 7) +YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1, 4, 7) +YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1, 4, 7) +YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1, 3, 7) +YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1, 3, 7) +YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C, + 1, 2, 7) +YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C, + 1, 2, 7) +YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7) +#endif // HAS_I422TOARGBROW_NEON +#ifdef HAS_I422TOYUY2ROW_NEON +YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15) +#endif // HAS_I422TOYUY2ROW_NEON +#ifdef HAS_I422TOUYVYROW_NEON +YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15) +#endif // HAS_I422TOUYVYROW_NEON +#undef YANY + +// Wrappers to handle odd width +#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP) \ + void NAMEANY(const uint8* y_buf, \ + const uint8* uv_buf, \ + uint8* rgb_buf, \ + int width) { \ + int n = width & ~7; \ + NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \ + NV12TORGB_C(y_buf + n, \ + uv_buf + (n >> UV_SHIFT), \ + rgb_buf + n * BPP, width & 7); \ + } + +#ifdef HAS_NV12TOARGBROW_SSSE3 +NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, + 0, 4) +NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, + 0, 4) +#endif // HAS_NV12TOARGBROW_SSSE3 +#ifdef HAS_NV12TOARGBROW_NEON +NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4) +NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4) +#endif // HAS_NV12TOARGBROW_NEON +#ifdef HAS_NV12TORGB565ROW_SSSE3 +NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C, + 0, 2) +NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C, + 0, 2) +#endif // HAS_NV12TORGB565ROW_SSSE3 +#ifdef HAS_NV12TORGB565ROW_NEON +NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2) +NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2) +#endif // HAS_NV12TORGB565ROW_NEON +#undef NVANY + +#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \ + void NAMEANY(const uint8* src, \ + uint8* dst, \ + int width) { \ + int n = width & ~MASK; \ + ARGBTORGB_SIMD(src, dst, n); \ + ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK); \ + } + +#if defined(HAS_ARGBTORGB24ROW_SSSE3) +RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C, + 15, 4, 3) +RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C, + 15, 4, 3) +RGBANY(ARGBToRGB565Row_Any_SSE2, 
ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C, + 3, 4, 2) +RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C, + 3, 4, 2) +RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, + 3, 4, 2) +#endif +#if defined(HAS_I400TOARGBROW_SSE2) +RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C, + 7, 1, 4) +#endif +#if defined(HAS_YTOARGBROW_SSE2) +RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, + 7, 1, 4) +RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C, + 15, 2, 4) +RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C, + 15, 2, 4) +// These require alignment on ARGB, so C is used for remainder. +RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C, + 15, 3, 4) +RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C, + 15, 3, 4) +RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C, + 7, 2, 4) +RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C, + 7, 2, 4) +RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C, + 7, 2, 4) +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) +RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3) +RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3) +RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C, + 7, 4, 2) +RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C, + 7, 4, 2) +RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C, + 7, 4, 2) +RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C, + 7, 1, 4) +RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C, + 7, 1, 4) +RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C, + 7, 2, 4) +RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, + 7, 2, 4) +#endif +#undef RGBANY + +// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst. +#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \ + void NAMEANY(const uint8* src, \ + uint8* dst, uint32 selector, \ + int width) { \ + int n = width & ~MASK; \ + ARGBTORGB_SIMD(src, dst, selector, n); \ + ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK); \ + } + +#if defined(HAS_ARGBTOBAYERROW_SSSE3) +BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C, + 7, 4, 1) +#endif +#if defined(HAS_ARGBTOBAYERROW_NEON) +BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C, + 7, 4, 1) +#endif +#if defined(HAS_ARGBTOBAYERGGROW_SSE2) +BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C, + 7, 4, 1) +#endif +#if defined(HAS_ARGBTOBAYERGGROW_NEON) +BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C, + 7, 4, 1) +#endif + +#undef BAYERANY + +// RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD. 
+#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM) \ + void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ + ARGBTOY_SIMD(src_argb, dst_y, width - NUM); \ + ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP, \ + dst_y + (width - NUM) * BPP, NUM); \ + } + +#ifdef HAS_ARGBTOYROW_AVX2 +YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32) +YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32) +YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32) +YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32) +#endif +#ifdef HAS_ARGBTOYROW_SSSE3 +YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16) +#endif +#ifdef HAS_BGRATOYROW_SSSE3 +YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16) +YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16) +YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16) +YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16) +YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16) +#endif +#ifdef HAS_ARGBTOYJROW_SSSE3 +YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16) +#endif +#ifdef HAS_ARGBTOYROW_NEON +YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_ARGBTOYJROW_NEON +YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_BGRATOYROW_NEON +YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_ABGRTOYROW_NEON +YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_RGBATOYROW_NEON +YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_RGB24TOYROW_NEON +YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8) +#endif +#ifdef HAS_RAWTOYROW_NEON +YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8) +#endif +#ifdef HAS_RGB565TOYROW_NEON +YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8) +#endif +#ifdef HAS_ARGB1555TOYROW_NEON +YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8) +#endif +#ifdef HAS_ARGB4444TOYROW_NEON +YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8) +#endif +#ifdef HAS_YUY2TOYROW_NEON +YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16) +#endif +#ifdef HAS_UYVYTOYROW_NEON +YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16) +#endif +#ifdef HAS_RGB24TOARGBROW_NEON +YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8) +#endif +#ifdef HAS_RAWTOARGBROW_NEON +YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8) +#endif +#ifdef HAS_RGB565TOARGBROW_NEON +YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8) +#endif +#ifdef HAS_ARGB1555TOARGBROW_NEON +YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8) +#endif +#ifdef HAS_ARGB4444TOARGBROW_NEON +YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8) +#endif +#undef YANY + +#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ + int n = width & ~MASK; \ + ARGBTOY_SIMD(src_argb, dst_y, n); \ + ARGBTOY_C(src_argb + n * SBPP, \ + dst_y + n * BPP, width & MASK); \ + } + +// Attenuate is destructive so last16 method can not be used due to overlap. 
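The two YANY definitions above take different approaches to a width that is not a whole number of SIMD blocks: the first re-runs the SIMD row over the final NUM pixels, re-covering pixels the first pass already wrote (which is why the destructive Attenuate rows below cannot use it), while the second masks the width down to a multiple of the block and lets the C row finish the tail. A minimal sketch of the two call patterns for width 25 and a 16-pixel block follows; the kernel names are hypothetical stand-ins, and the rounding-up of the SIMD stub mirrors the overlap noted in the Attenuate comment rather than any particular row.

#include <stdio.h>

/* Hypothetical stand-ins for a 16-pixel SIMD row and its C fallback. */
static void FakeSimdRow(int start, int count) {
  int blocks = (count + 15) / 16;  /* SIMD rows work in whole blocks. */
  printf("SIMD: pixels [%d, %d)\n", start, start + blocks * 16);
}
static void FakeCRow(int start, int count) {
  printf("C   : pixels [%d, %d)\n", start, start + count);
}

int main(void) {
  int width = 25;

  /* Scheme 1, "last NUM": SIMD on width - 16, then SIMD once more on the
     final 16. The second call re-writes pixels 9..15, so the row must be
     neither destructive nor run in place. */
  FakeSimdRow(0, width - 16);       /* covers [0, 16)  */
  FakeSimdRow(width - 16, 16);      /* covers [9, 25)  */

  /* Scheme 2, mask + C remainder: no overlap, safe for in-place rows. */
  int n = width & ~15;              /* 16 */
  FakeSimdRow(0, n);                /* covers [0, 16)  */
  FakeCRow(n, width & 15);          /* covers [16, 25) */
  return 0;
}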
+#ifdef HAS_ARGBATTENUATEROW_SSSE3 +YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C, + 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_SSE2 +YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C, + 4, 4, 3) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +YANY(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, ARGBUnattenuateRow_C, + 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_AVX2 +YANY(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, ARGBAttenuateRow_C, + 4, 4, 7) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +YANY(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, ARGBUnattenuateRow_C, + 4, 4, 7) +#endif +#ifdef HAS_ARGBATTENUATEROW_NEON +YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C, + 4, 4, 7) +#endif +#undef YANY + +// RGB/YUV to UV does multiple of 16 with SIMD and remainder with C. +#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \ + void NAMEANY(const uint8* src_argb, int src_stride_argb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + int n = width & ~MASK; \ + ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \ + ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \ + dst_u + (n >> 1), \ + dst_v + (n >> 1), \ + width & MASK); \ + } + +#ifdef HAS_ARGBTOUVROW_AVX2 +UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31) +UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31) +UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31) +#endif +#ifdef HAS_ARGBTOUVROW_SSSE3 +UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15) +UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C, + 4, 15) +UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15) +UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15) +UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15) +UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15) +UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15) +#endif +#ifdef HAS_ARGBTOUVROW_NEON +UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15) +#endif +#ifdef HAS_ARGBTOUVJROW_NEON +UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15) +#endif +#ifdef HAS_BGRATOUVROW_NEON +UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15) +#endif +#ifdef HAS_ABGRTOUVROW_NEON +UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15) +#endif +#ifdef HAS_RGBATOUVROW_NEON +UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15) +#endif +#ifdef HAS_RGB24TOUVROW_NEON +UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_NEON +UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15) +#endif +#ifdef HAS_RGB565TOUVROW_NEON +UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_NEON +UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15) +#endif +#ifdef HAS_ARGB4444TOUVROW_NEON +UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15) +#endif +#ifdef HAS_YUY2TOUVROW_NEON +UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15) +#endif +#ifdef HAS_UYVYTOUVROW_NEON +UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15) +#endif +#undef UVANY + +#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT) \ + void NAMEANY(const uint8* src_uv, \ + 
uint8* dst_u, uint8* dst_v, int width) { \ + int n = width & ~MASK; \ + ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \ + ANYTOUV_C(src_uv + n * BPP, \ + dst_u + (n >> SHIFT), \ + dst_v + (n >> SHIFT), \ + width & MASK); \ + } + +#ifdef HAS_ARGBTOUV444ROW_SSSE3 +UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3, + ARGBToUV444Row_C, 4, 15, 0) +#endif +#ifdef HAS_YUY2TOUV422ROW_AVX2 +UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, + YUY2ToUV422Row_C, 2, 31, 1) +UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, + UYVYToUV422Row_C, 2, 31, 1) +#endif +#ifdef HAS_ARGBTOUVROW_SSSE3 +UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3, + ARGBToUV422Row_C, 4, 15, 1) +UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, + YUY2ToUV422Row_C, 2, 15, 1) +UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, + UYVYToUV422Row_C, 2, 15, 1) +#endif +#ifdef HAS_YUY2TOUV422ROW_NEON +UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, + ARGBToUV444Row_C, 4, 7, 0) +UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, + ARGBToUV422Row_C, 4, 15, 1) +UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, + ARGBToUV411Row_C, 4, 31, 2) +UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, + YUY2ToUV422Row_C, 2, 15, 1) +UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, + UYVYToUV422Row_C, 2, 15, 1) +#endif +#undef UV422ANY + +#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \ + void NAMEANY(const uint8* src_uv, \ + uint8* dst_u, uint8* dst_v, int width) { \ + int n = width & ~MASK; \ + ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \ + ANYTOUV_C(src_uv + n * 2, \ + dst_u + n, \ + dst_v + n, \ + width & MASK); \ + } + +#ifdef HAS_SPLITUVROW_SSE2 +SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15) +#endif +#ifdef HAS_SPLITUVROW_AVX2 +SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31) +#endif +#ifdef HAS_SPLITUVROW_NEON +SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15) +#endif +#ifdef HAS_SPLITUVROW_MIPS_DSPR2 +SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2, + SplitUVRow_C, 15) +#endif +#undef SPLITUVROWANY + +#define MERGEUVROW_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \ + void NAMEANY(const uint8* src_u, const uint8* src_v, \ + uint8* dst_uv, int width) { \ + int n = width & ~MASK; \ + ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \ + ANYTOUV_C(src_u + n, \ + src_v + n, \ + dst_uv + n * 2, \ + width & MASK); \ + } + +#ifdef HAS_MERGEUVROW_SSE2 +MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15) +#endif +#ifdef HAS_MERGEUVROW_AVX2 +MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31) +#endif +#ifdef HAS_MERGEUVROW_NEON +MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15) +#endif +#undef MERGEUVROW_ANY + +#define MATHROW_ANY(NAMEANY, ARGBMATH_SIMD, ARGBMATH_C, MASK) \ + void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \ + uint8* dst_argb, int width) { \ + int n = width & ~MASK; \ + ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \ + ARGBMATH_C(src_argb0 + n * 4, \ + src_argb1 + n * 4, \ + dst_argb + n * 4, \ + width & MASK); \ + } + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +MATHROW_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, ARGBMultiplyRow_C, + 3) +#endif +#ifdef HAS_ARGBADDROW_SSE2 +MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3) +#endif +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C, + 3) 
+#endif +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +MATHROW_ANY(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, ARGBMultiplyRow_C, + 7) +#endif +#ifdef HAS_ARGBADDROW_AVX2 +MATHROW_ANY(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, ARGBAddRow_C, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +MATHROW_ANY(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, ARGBSubtractRow_C, + 7) +#endif +#ifdef HAS_ARGBMULTIPLYROW_NEON +MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C, + 7) +#endif +#ifdef HAS_ARGBADDROW_NEON +MATHROW_ANY(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, ARGBAddRow_C, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_NEON +MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C, + 7) +#endif +#undef MATHROW_ANY + +// Shuffle may want to work in place, so last16 method can not be used. +#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_argb, uint8* dst_argb, \ + const uint8* shuffler, int width) { \ + int n = width & ~MASK; \ + ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \ + ARGBTOY_C(src_argb + n * SBPP, \ + dst_argb + n * BPP, shuffler, width & MASK); \ + } + +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, + ARGBShuffleRow_C, 4, 4, 3) +#endif +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3, + ARGBShuffleRow_C, 4, 4, 7) +#endif +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +YANY(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, + ARGBShuffleRow_C, 4, 4, 15) +#endif +#ifdef HAS_ARGBSHUFFLEROW_NEON +YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, + ARGBShuffleRow_C, 4, 4, 3) +#endif +#undef YANY + +// Interpolate may want to work in place, so last16 method can not be used. +#define NANY(NAMEANY, TERP_SIMD, TERP_C, SBPP, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, \ + int source_y_fraction) { \ + int n = width & ~MASK; \ + TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, \ + n, source_y_fraction); \ + TERP_C(dst_ptr + n * BPP, \ + src_ptr + n * SBPP, src_stride_ptr, \ + width & MASK, source_y_fraction); \ + } + +#ifdef HAS_INTERPOLATEROW_AVX2 +NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, + InterpolateRow_C, 1, 1, 32) +#endif +#ifdef HAS_INTERPOLATEROW_SSSE3 +NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3, + InterpolateRow_C, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_SSE2 +NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2, + InterpolateRow_C, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_NEON +NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON, + InterpolateRow_C, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2 +NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, + InterpolateRow_C, 1, 1, 3) +#endif +#undef NANY + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/row_common.cc b/TMessagesProj/jni/libyuv/source/row_common.cc new file mode 100644 index 000000000..fa2b752a2 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/row_common.cc @@ -0,0 +1,2286 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/row.h" + +#include <string.h> // For memcpy and memset. + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// llvm x86 is poor at ternary operator, so use branchless min/max. + +#define USE_BRANCHLESS 1 +#if USE_BRANCHLESS +static __inline int32 clamp0(int32 v) { + return ((-(v) >> 31) & (v)); +} + +static __inline int32 clamp255(int32 v) { + return (((255 - (v)) >> 31) | (v)) & 255; +} + +static __inline uint32 Clamp(int32 val) { + int v = clamp0(val); + return (uint32)(clamp255(v)); +} + +static __inline uint32 Abs(int32 v) { + int m = v >> 31; + return (v + m) ^ m; +} +#else // USE_BRANCHLESS +static __inline int32 clamp0(int32 v) { + return (v < 0) ? 0 : v; +} + +static __inline int32 clamp255(int32 v) { + return (v > 255) ? 255 : v; +} + +static __inline uint32 Clamp(int32 val) { + int v = clamp0(val); + return (uint32)(clamp255(v)); +} + +static __inline uint32 Abs(int32 v) { + return (v < 0) ? -v : v; +} +#endif // USE_BRANCHLESS + +#ifdef LIBYUV_LITTLE_ENDIAN +#define WRITEWORD(p, v) *(uint32*)(p) = v +#else +static inline void WRITEWORD(uint8* p, uint32 v) { + p[0] = (uint8)(v & 255); + p[1] = (uint8)((v >> 8) & 255); + p[2] = (uint8)((v >> 16) & 255); + p[3] = (uint8)((v >> 24) & 255); +} +#endif + +void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_rgb24[0]; + uint8 g = src_rgb24[1]; + uint8 r = src_rgb24[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb24 += 3; + } +} + +void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 r = src_raw[0]; + uint8 g = src_raw[1]; + uint8 b = src_raw[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb += 4; + src_raw += 3; + } +} + +void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_rgb565[0] & 0x1f; + uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r = src_rgb565[1] >> 3; + dst_argb[0] = (b << 3) | (b >> 2); + dst_argb[1] = (g << 2) | (g >> 4); + dst_argb[2] = (r << 3) | (r >> 2); + dst_argb[3] = 255u; + dst_argb += 4; + src_rgb565 += 2; + } +} + +void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb1555[0] & 0x1f; + uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r = (src_argb1555[1] & 0x7c) >> 2; + uint8 a = src_argb1555[1] >> 7; + dst_argb[0] = (b << 3) | (b >> 2); + dst_argb[1] = (g << 3) | (g >> 2); + dst_argb[2] = (r << 3) | (r >> 2); + dst_argb[3] = -a; + dst_argb += 4; + src_argb1555 += 2; + } +} + +void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb, + int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb4444[0] & 0x0f; + uint8 g = src_argb4444[0] >> 4; + uint8 r = src_argb4444[1] & 0x0f; + uint8 a = src_argb4444[1] >> 4; + dst_argb[0] = (b << 4) | b; + dst_argb[1] = (g << 4) | g; + dst_argb[2] = (r << 4) | r; + dst_argb[3] = (a << 4) | a; + dst_argb += 4; + src_argb4444 += 2; + } +} + +void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb[0]; + uint8 g = src_argb[1]; + uint8 r = src_argb[2]; + dst_rgb[0] = b; + dst_rgb[1] = g; + dst_rgb[2] = r; + dst_rgb += 3; + src_argb += 4; + } +}
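The branchless clamp0/clamp255 helpers near the top of this file lean on the sign bit: shifting a negative 32-bit value right by 31 yields all ones, a non-negative value yields zero, and that mask either forces the result or lets the input through. A small self-contained check of both expressions; like the originals, it assumes right-shifting a negative signed integer is an arithmetic shift.

#include <assert.h>
#include <stdint.h>

/* Same expressions as clamp0()/clamp255() above, restated with stdint types.
   Assumes '>>' on a negative int32_t fills with the sign bit. */
static int32_t my_clamp0(int32_t v)   { return ((-v) >> 31) & v; }
static int32_t my_clamp255(int32_t v) { return (((255 - v) >> 31) | v) & 255; }

int main(void) {
  /* v < 0: -v is positive, the shift gives 0 and the AND forces 0.
     v > 0: -v is negative, the shift gives all ones and v passes through
     (v == 0 trivially stays 0 either way). */
  assert(my_clamp0(-7) == 0);
  assert(my_clamp0(200) == 200);

  /* v > 255: 255 - v is negative, the shift gives all ones, the OR saturates
     and the final AND keeps 255.  v in [0, 255]: the shift gives 0, so
     v & 255 is just v. */
  assert(my_clamp255(300) == 255);
  assert(my_clamp255(42) == 42);
  return 0;
}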
+ +void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb[0]; + uint8 g = src_argb[1]; + uint8 r = src_argb[2]; + dst_rgb[0] = r; + dst_rgb[1] = g; + dst_rgb[2] = b; + dst_rgb += 3; + src_argb += 4; + } +} + +void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 2; + uint8 r0 = src_argb[2] >> 3; + uint8 b1 = src_argb[4] >> 3; + uint8 g1 = src_argb[5] >> 2; + uint8 r1 = src_argb[6] >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 2; + uint8 r0 = src_argb[2] >> 3; + *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 3; + uint8 r0 = src_argb[2] >> 3; + uint8 a0 = src_argb[3] >> 7; + uint8 b1 = src_argb[4] >> 3; + uint8 g1 = src_argb[5] >> 3; + uint8 r1 = src_argb[6] >> 3; + uint8 a1 = src_argb[7] >> 7; + *(uint32*)(dst_rgb) = + b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 3; + uint8 r0 = src_argb[2] >> 3; + uint8 a0 = src_argb[3] >> 7; + *(uint16*)(dst_rgb) = + b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + } +} + +void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 4; + uint8 g0 = src_argb[1] >> 4; + uint8 r0 = src_argb[2] >> 4; + uint8 a0 = src_argb[3] >> 4; + uint8 b1 = src_argb[4] >> 4; + uint8 g1 = src_argb[5] >> 4; + uint8 r1 = src_argb[6] >> 4; + uint8 a1 = src_argb[7] >> 4; + *(uint32*)(dst_rgb) = + b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 4; + uint8 g0 = src_argb[1] >> 4; + uint8 r0 = src_argb[2] >> 4; + uint8 a0 = src_argb[3] >> 4; + *(uint16*)(dst_rgb) = + b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + } +} + +static __inline int RGBToY(uint8 r, uint8 g, uint8 b) { + return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +} + +static __inline int RGBToU(uint8 r, uint8 g, uint8 b) { + return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; +} +static __inline int RGBToV(uint8 r, uint8 g, uint8 b) { + return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; +} + +#define MAKEROWY(NAME, R, G, B, BPP) \ +void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ +} \ +void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \ + src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \ + uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \ + src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \ + uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \ + src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ 
+ dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ +} + +MAKEROWY(ARGB, 2, 1, 0, 4) +MAKEROWY(BGRA, 1, 2, 3, 4) +MAKEROWY(ABGR, 0, 1, 2, 4) +MAKEROWY(RGBA, 3, 2, 1, 4) +MAKEROWY(RGB24, 2, 1, 0, 3) +MAKEROWY(RAW, 0, 1, 2, 3) +#undef MAKEROWY + +// JPeg uses a variation on BT.601-1 full range +// y = 0.29900 * r + 0.58700 * g + 0.11400 * b +// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center +// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center +// BT.601 Mpeg range uses: +// b 0.1016 * 255 = 25.908 = 25 +// g 0.5078 * 255 = 129.489 = 129 +// r 0.2578 * 255 = 65.739 = 66 +// JPeg 8 bit Y (not used): +// b 0.11400 * 256 = 29.184 = 29 +// g 0.58700 * 256 = 150.272 = 150 +// r 0.29900 * 256 = 76.544 = 77 +// JPeg 7 bit Y: +// b 0.11400 * 128 = 14.592 = 15 +// g 0.58700 * 128 = 75.136 = 75 +// r 0.29900 * 128 = 38.272 = 38 +// JPeg 8 bit U: +// b 0.50000 * 255 = 127.5 = 127 +// g -0.33126 * 255 = -84.4713 = -84 +// r -0.16874 * 255 = -43.0287 = -43 +// JPeg 8 bit V: +// b -0.08131 * 255 = -20.73405 = -20 +// g -0.41869 * 255 = -106.76595 = -107 +// r 0.50000 * 255 = 127.5 = 127 + +static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) { + return (38 * r + 75 * g + 15 * b + 64) >> 7; +} + +static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) { + return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; +} +static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) { + return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; +} + +#define AVGB(a, b) (((a) + (b) + 1) >> 1) + +#define MAKEROWYJ(NAME, R, G, B, BPP) \ +void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ +} \ +void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ +} + +MAKEROWYJ(ARGB, 2, 1, 0, 4) +#undef MAKEROWYJ + +void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_rgb565[0] & 0x1f; + uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r = src_rgb565[1] >> 3; + b = (b << 3) | (b >> 2); + g = (g << 2) | (g >> 4); + r = (r << 3) | (r >> 2); + dst_y[0] = RGBToY(r, g, b); + src_rgb565 += 2; + dst_y += 1; + } +} + +void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) { + int x; + for (x = 0; 
x < width; ++x) { + uint8 b = src_argb1555[0] & 0x1f; + uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r = (src_argb1555[1] & 0x7c) >> 2; + b = (b << 3) | (b >> 2); + g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); + dst_y[0] = RGBToY(r, g, b); + src_argb1555 += 2; + dst_y += 1; + } +} + +void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb4444[0] & 0x0f; + uint8 g = src_argb4444[0] >> 4; + uint8 r = src_argb4444[1] & 0x0f; + b = (b << 4) | b; + g = (g << 4) | g; + r = (r << 4) | r; + dst_y[0] = RGBToY(r, g, b); + src_argb4444 += 2; + dst_y += 1; + } +} + +void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_rgb565[0] & 0x1f; + uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r0 = src_rgb565[1] >> 3; + uint8 b1 = src_rgb565[2] & 0x1f; + uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); + uint8 r1 = src_rgb565[3] >> 3; + uint8 b2 = next_rgb565[0] & 0x1f; + uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8 r2 = next_rgb565[1] >> 3; + uint8 b3 = next_rgb565[2] & 0x1f; + uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); + uint8 r3 = next_rgb565[3] >> 3; + uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 787 -> 888. + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_rgb565 += 4; + next_rgb565 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_rgb565[0] & 0x1f; + uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r0 = src_rgb565[1] >> 3; + uint8 b2 = next_rgb565[0] & 0x1f; + uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8 r2 = next_rgb565[1] >> 3; + uint8 b = (b0 + b2); // 565 * 2 = 676. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 2) | (b >> 4); // 676 -> 888 + g = (g << 1) | (g >> 6); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb1555[0] & 0x1f; + uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8 b1 = src_argb1555[2] & 0x1f; + uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); + uint8 r1 = (src_argb1555[3] & 0x7c) >> 2; + uint8 b2 = next_argb1555[0] & 0x1f; + uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8 r2 = (next_argb1555[1] & 0x7c) >> 2; + uint8 b3 = next_argb1555[2] & 0x1f; + uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); + uint8 r3 = (next_argb1555[3] & 0x7c) >> 2; + uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 777 -> 888. 
+ g = (g << 1) | (g >> 6); + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb1555 += 4; + next_argb1555 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_argb1555[0] & 0x1f; + uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8 b2 = next_argb1555[0] & 0x1f; + uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8 r2 = next_argb1555[1] >> 3; + uint8 b = (b0 + b2); // 555 * 2 = 666. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 2) | (b >> 4); // 666 -> 888. + g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb4444[0] & 0x0f; + uint8 g0 = src_argb4444[0] >> 4; + uint8 r0 = src_argb4444[1] & 0x0f; + uint8 b1 = src_argb4444[2] & 0x0f; + uint8 g1 = src_argb4444[2] >> 4; + uint8 r1 = src_argb4444[3] & 0x0f; + uint8 b2 = next_argb4444[0] & 0x0f; + uint8 g2 = next_argb4444[0] >> 4; + uint8 r2 = next_argb4444[1] & 0x0f; + uint8 b3 = next_argb4444[2] & 0x0f; + uint8 g3 = next_argb4444[2] >> 4; + uint8 r3 = next_argb4444[3] & 0x0f; + uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 2) | (b >> 4); // 666 -> 888. + g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb4444 += 4; + next_argb4444 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_argb4444[0] & 0x0f; + uint8 g0 = src_argb4444[0] >> 4; + uint8 r0 = src_argb4444[1] & 0x0f; + uint8 b2 = next_argb4444[0] & 0x0f; + uint8 g2 = next_argb4444[0] >> 4; + uint8 r2 = next_argb4444[1] & 0x0f; + uint8 b = (b0 + b2); // 444 * 2 = 555. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 3) | (b >> 2); // 555 -> 888. 
+ g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGBToUV444Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 4; + dst_u += 1; + dst_v += 1; + } +} + +void ARGBToUV422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 ab = (src_argb[0] + src_argb[4]) >> 1; + uint8 ag = (src_argb[1] + src_argb[5]) >> 1; + uint8 ar = (src_argb[2] + src_argb[6]) >> 1; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 8; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } +} + +void ARGBToUV411Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 3; x += 4) { + uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2; + uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2; + uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 16; + dst_u += 1; + dst_v += 1; + } + if ((width & 3) == 3) { + uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3; + uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3; + uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } else if ((width & 3) == 2) { + uint8 ab = (src_argb[0] + src_argb[4]) >> 1; + uint8 ag = (src_argb[1] + src_argb[5]) >> 1; + uint8 ar = (src_argb[2] + src_argb[6]) >> 1; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } else if ((width & 3) == 1) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } +} + +void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = src_argb[3]; + dst_argb += 4; + src_argb += 4; + } +} + +// Convert a row of image to Sepia tone. +void ARGBSepiaRow_C(uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int sb = (b * 17 + g * 68 + r * 35) >> 7; + int sg = (b * 22 + g * 88 + r * 45) >> 7; + int sr = (b * 24 + g * 98 + r * 50) >> 7; + // b does not over flow. a is preserved from original. + dst_argb[0] = sb; + dst_argb[1] = clamp255(sg); + dst_argb[2] = clamp255(sr); + dst_argb += 4; + } +} + +// Apply color matrix to a row of image. Matrix is signed. +// TODO(fbarchard): Consider adding rounding (+32). 
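The ARGBToUV444/422/411 rows just above all reduce to the RGBToY/RGBToU/RGBToV helpers defined earlier in this file: fixed-point BT.601 studio-range conversion with the coefficients scaled by 256, the +16 luma offset and 0.5 rounding folded into 0x1080, and the 128 chroma bias folded into 0x8080. A quick worked check of those constants, restating the same expressions under hypothetical names:

#include <assert.h>

typedef unsigned char uint8;

/* Same fixed-point expressions as RGBToY/RGBToU/RGBToV above. */
static int ToY(uint8 r, uint8 g, uint8 b) { return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; }
static int ToU(uint8 r, uint8 g, uint8 b) { return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; }
static int ToV(uint8 r, uint8 g, uint8 b) { return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; }

int main(void) {
  /* Black maps to studio-range black: Y = 0x1080 >> 8 = 16, neutral chroma. */
  assert(ToY(0, 0, 0) == 16);
  assert(ToU(0, 0, 0) == 128 && ToV(0, 0, 0) == 128);

  /* White maps to studio-range white: (66 + 129 + 25) * 255 + 0x1080 = 60324,
     and 60324 >> 8 = 235. */
  assert(ToY(255, 255, 255) == 235);

  /* The chroma coefficients each sum to zero (112 - 74 - 38, 112 - 94 - 18),
     so any gray keeps U = V = 128. */
  assert(ToU(255, 255, 255) == 128 && ToV(255, 255, 255) == 128);
  return 0;
}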
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = src_argb[0]; + int g = src_argb[1]; + int r = src_argb[2]; + int a = src_argb[3]; + int sb = (b * matrix_argb[0] + g * matrix_argb[1] + + r * matrix_argb[2] + a * matrix_argb[3]) >> 6; + int sg = (b * matrix_argb[4] + g * matrix_argb[5] + + r * matrix_argb[6] + a * matrix_argb[7]) >> 6; + int sr = (b * matrix_argb[8] + g * matrix_argb[9] + + r * matrix_argb[10] + a * matrix_argb[11]) >> 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + + r * matrix_argb[14] + a * matrix_argb[15]) >> 6; + dst_argb[0] = Clamp(sb); + dst_argb[1] = Clamp(sg); + dst_argb[2] = Clamp(sr); + dst_argb[3] = Clamp(sa); + src_argb += 4; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int a = dst_argb[3]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb[3] = table_argb[a * 4 + 3]; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb += 4; + } +} + +void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; + dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset; + dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset; + dst_argb += 4; + } +} + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v * f >> 24 + +void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + const uint32 b_scale = REPEAT8(value & 0xff); + const uint32 g_scale = REPEAT8((value >> 8) & 0xff); + const uint32 r_scale = REPEAT8((value >> 16) & 0xff); + const uint32 a_scale = REPEAT8(value >> 24); + + int i; + for (i = 0; i < width; ++i) { + const uint32 b = REPEAT8(src_argb[0]); + const uint32 g = REPEAT8(src_argb[1]); + const uint32 r = REPEAT8(src_argb[2]); + const uint32 a = REPEAT8(src_argb[3]); + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v * f >> 16 + +void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + const uint32 b = REPEAT8(src_argb0[0]); + const uint32 g = REPEAT8(src_argb0[1]); + const uint32 r = REPEAT8(src_argb0[2]); + const uint32 a = REPEAT8(src_argb0[3]); + const uint32 b_scale = src_argb1[0]; + const uint32 g_scale = src_argb1[1]; + const uint32 r_scale = src_argb1[2]; + const uint32 a_scale = src_argb1[3]; + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb0 += 
4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define SHADE(f, v) clamp255(v + f) + +void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb0[0]; + const int g = src_argb0[1]; + const int r = src_argb0[2]; + const int a = src_argb0[3]; + const int b_add = src_argb1[0]; + const int g_add = src_argb1[1]; + const int r_add = src_argb1[2]; + const int a_add = src_argb1[3]; + dst_argb[0] = SHADE(b, b_add); + dst_argb[1] = SHADE(g, g_add); + dst_argb[2] = SHADE(r, r_add); + dst_argb[3] = SHADE(a, a_add); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +#define SHADE(f, v) clamp0(f - v) + +void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb0[0]; + const int g = src_argb0[1]; + const int r = src_argb0[2]; + const int a = src_argb0[3]; + const int b_sub = src_argb1[0]; + const int g_sub = src_argb1[1]; + const int r_sub = src_argb1[2]; + const int a_sub = src_argb1[3]; + dst_argb[0] = SHADE(b, b_sub); + dst_argb[1] = SHADE(g, g_sub); + dst_argb[2] = SHADE(r, r_sub); + dst_argb[3] = SHADE(a, a_sub); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +// Sobel functions which mimics SSSE3. +void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, + uint8* dst_sobelx, int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i]; + int b = src_y1[i]; + int c = src_y2[i]; + int a_sub = src_y0[i + 2]; + int b_sub = src_y1[i + 2]; + int c_sub = src_y2[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobelx[i] = (uint8)(clamp255(sobel)); + } +} + +void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i + 0]; + int b = src_y0[i + 1]; + int c = src_y0[i + 2]; + int a_sub = src_y1[i + 0]; + int b_sub = src_y1[i + 1]; + int c_sub = src_y1[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobely[i] = (uint8)(clamp255(sobel)); + } +} + +void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_argb[0] = (uint8)(s); + dst_argb[1] = (uint8)(s); + dst_argb[2] = (uint8)(s); + dst_argb[3] = (uint8)(255u); + dst_argb += 4; + } +} + +void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_y[i] = (uint8)(s); + } +} + +void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int g = clamp255(r + b); + dst_argb[0] = (uint8)(b); + dst_argb[1] = (uint8)(g); + dst_argb[2] = (uint8)(r); + dst_argb[3] = (uint8)(255u); + dst_argb += 4; + } +} + +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { + // Copy a Y to RGB. 
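// Note: this row copies Y straight into B, G and R with alpha 255; compare
// YToARGBRow_C further down, which instead expands studio-range Y through
// YuvPixel(y, 128, 128) before writing ARGB.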
+ int x; + for (x = 0; x < width; ++x) { + uint8 y = src_y[0]; + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = 255u; + dst_argb += 4; + ++src_y; + } +} + +// C reference code that mimics the YUV assembly. + +#define YG 74 /* (int8)(1.164 * 64 + 0.5) */ + +#define UB 127 /* min(63,(int8)(2.018 * 64)) */ +#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ +#define UR 0 + +#define VB 0 +#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ +#define VR 102 /* (int8)(1.596 * 64 + 0.5) */ + +// Bias +#define BB UB * 128 + VB * 128 +#define BG UG * 128 + VG * 128 +#define BR UR * 128 + VR * 128 + +static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, + uint8* b, uint8* g, uint8* r) { + int32 y1 = ((int32)(y) - 16) * YG; + *b = Clamp((int32)((u * UB + v * VB) - (BB) + y1) >> 6); + *g = Clamp((int32)((u * UG + v * VG) - (BG) + y1) >> 6); + *r = Clamp((int32)((u * UR + v * VR) - (BR) + y1) >> 6); +} + +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +// C mimic assembly. +// TODO(fbarchard): Remove subsampling from Neon. +void I444ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 u = (src_u[0] + src_u[1] + 1) >> 1; + uint8 v = (src_v[0] + src_v[1] + 1) >> 1; + YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + src_u += 2; + src_v += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + } +} +#else +void I444ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} +#endif +// Also used for 420 +void I422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void I422ToRGB24Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 3, rgb_buf + 4, rgb_buf + 5); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 6; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + } +} + +void I422ToRAWRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 5, rgb_buf + 4, rgb_buf + 3); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + } +} + +void I422ToARGB4444Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + b1 = b1 >> 4; + g1 = g1 >> 4; + r1 = r1 >> 4; + *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000; + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb4444 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | + 0xf000; + } +} + +void I422ToARGB1555Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 3; + r1 = r1 >> 3; + *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000; + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb1555 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | + 0x8000; + } +} + +void I422ToRGB565Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + src_u += 1; + src_v += 1; + dst_rgb565 += 4; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void I411ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 3; x += 4) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + YuvPixel(src_y[2], src_u[0], src_v[0], + rgb_buf + 8, rgb_buf + 9, rgb_buf + 10); + rgb_buf[11] = 255; + YuvPixel(src_y[3], src_u[0], src_v[0], + rgb_buf + 12, rgb_buf + 13, rgb_buf + 14); + rgb_buf[15] = 255; + src_y += 4; + src_u += 1; + src_v += 1; + rgb_buf += 16; // Advance 4 pixels. + } + if (width & 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void NV12ToARGBRow_C(const uint8* src_y, + const uint8* usrc_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], usrc_v[0], usrc_v[1], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], usrc_v[0], usrc_v[1], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + usrc_v += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], usrc_v[0], usrc_v[1], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void NV21ToARGBRow_C(const uint8* src_y, + const uint8* src_vu, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + + YuvPixel(src_y[1], src_vu[1], src_vu[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + + src_y += 2; + src_vu += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void NV12ToRGB565Row_C(const uint8* src_y, + const uint8* usrc_v, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0); + YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + usrc_v += 2; + dst_rgb565 += 4; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void NV21ToRGB565Row_C(const uint8* src_y, + const uint8* vsrc_u, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0); + YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + vsrc_u += 2; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void YUY2ToARGBRow_C(const uint8* src_yuy2, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_yuy2 += 4; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void UYVYToARGBRow_C(const uint8* src_uyvy, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_uyvy += 4; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void I422ToBGRARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 3, rgb_buf + 2, rgb_buf + 1); + rgb_buf[0] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 7, rgb_buf + 6, rgb_buf + 5); + rgb_buf[4] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 3, rgb_buf + 2, rgb_buf + 1); + rgb_buf[0] = 255; + } +} + +void I422ToABGRRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 6, rgb_buf + 5, rgb_buf + 4); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + rgb_buf[3] = 255; + } +} + +void I422ToRGBARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 1, rgb_buf + 2, rgb_buf + 3); + rgb_buf[0] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 5, rgb_buf + 6, rgb_buf + 7); + rgb_buf[4] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 1, rgb_buf + 2, rgb_buf + 3); + rgb_buf[0] = 255; + } +} + +void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], 128, 128, + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], 128, 128, + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], 128, 128, + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void MirrorRow_C(const uint8* src, uint8* dst, int width) { + int x; + src += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + +void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { + int x; + src_uv += (width - 1) << 1; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[-2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[-2 + 1]; + src_uv -= 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) { + int x; + const uint32* src32 = (const uint32*)(src); + uint32* dst32 = (uint32*)(dst); + src32 += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst32[x] = src32[0]; + dst32[x + 1] = src32[-1]; + src32 -= 2; + } + if (width & 1) { + dst32[width - 1] = src32[0]; + } +} + +void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[3]; + src_uv += 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x]; + dst_uv[1] = src_v[x]; + dst_uv[2] = src_u[x + 1]; + dst_uv[3] = src_v[x + 1]; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1]; + dst_uv[1] = src_v[width - 1]; + } +} + +void CopyRow_C(const uint8* src, uint8* dst, int count) { + memcpy(dst, src, count); +} + +void CopyRow_16_C(const uint16* src, uint16* dst, int count) { + memcpy(dst, src, count * 2); +} + +void SetRow_C(uint8* dst, uint32 v8, int count) { +#ifdef _MSC_VER + // VC will generate rep stosb. 
+ int x; + for (x = 0; x < count; ++x) { + dst[x] = v8; + } +#else + memset(dst, v8, count); +#endif +} + +void ARGBSetRows_C(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + int y; + for (y = 0; y < height; ++y) { + uint32* d = (uint32*)(dst); + int x; + for (x = 0; x < width; ++x) { + d[x] = v32; + } + dst += dst_stride; + } +} + +// Filter 2 rows of YUY2 UV's (422) into U and V (420). +void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values, filtering 2 rows of YUY2. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 UV's (422) into U and V (422). +void YUY2ToUV422Row_C(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_yuy2[1]; + dst_v[0] = src_yuy2[3]; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 Y's (422) into Y (420/422). +void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_yuy2[0]; + dst_y[x + 1] = src_yuy2[2]; + src_yuy2 += 4; + } + if (width & 1) { + dst_y[width - 1] = src_yuy2[0]; + } +} + +// Filter 2 rows of UYVY UV's (422) into U and V (420). +void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; + dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY UV's (422) into U and V (422). +void UYVYToUV422Row_C(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_uyvy[0]; + dst_v[0] = src_uyvy[2]; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY Y's (422) into Y (420/422). +void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_uyvy[1]; + dst_y[x + 1] = src_uyvy[3]; + src_uyvy += 4; + } + if (width & 1) { + dst_y[width - 1] = src_uyvy[1]; + } +} + +#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f + +// Blend src_argb0 over src_argb1 and store to dst_argb. +// dst_argb may be src_argb0 or src_argb1. +// This code mimics the SSSE3 version for better testability. 
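+// BLEND computes a premultiplied-alpha "over": the background channel is
+// scaled by (256 - a) / 256 and the already-attenuated foreground channel is
+// added.  For example, f = 50, b = 100, a = 192 gives
+// ((256 - 192) * 100 >> 8) + 50 = 25 + 50 = 75.  Destination alpha is set
+// to 255.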
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint32 fb = src_argb0[0]; + uint32 fg = src_argb0[1]; + uint32 fr = src_argb0[2]; + uint32 a = src_argb0[3]; + uint32 bb = src_argb1[0]; + uint32 bg = src_argb1[1]; + uint32 br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + + fb = src_argb0[4 + 0]; + fg = src_argb0[4 + 1]; + fr = src_argb0[4 + 2]; + a = src_argb0[4 + 3]; + bb = src_argb1[4 + 0]; + bg = src_argb1[4 + 1]; + br = src_argb1[4 + 2]; + dst_argb[4 + 0] = BLEND(fb, bb, a); + dst_argb[4 + 1] = BLEND(fg, bg, a); + dst_argb[4 + 2] = BLEND(fr, br, a); + dst_argb[4 + 3] = 255u; + src_argb0 += 8; + src_argb1 += 8; + dst_argb += 8; + } + + if (width & 1) { + uint32 fb = src_argb0[0]; + uint32 fg = src_argb0[1]; + uint32 fr = src_argb0[2]; + uint32 a = src_argb0[3]; + uint32 bb = src_argb1[0]; + uint32 bg = src_argb1[1]; + uint32 br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + } +} +#undef BLEND +#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 + +// Multiply source RGB by alpha and store to destination. +// This code mimics the SSSE3 version for better testability. +void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + uint32 b = src_argb[0]; + uint32 g = src_argb[1]; + uint32 r = src_argb[2]; + uint32 a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + b = src_argb[4]; + g = src_argb[5]; + r = src_argb[6]; + a = src_argb[7]; + dst_argb[4] = ATTENUATE(b, a); + dst_argb[5] = ATTENUATE(g, a); + dst_argb[6] = ATTENUATE(r, a); + dst_argb[7] = a; + src_argb += 8; + dst_argb += 8; + } + + if (width & 1) { + const uint32 b = src_argb[0]; + const uint32 g = src_argb[1]; + const uint32 r = src_argb[2]; + const uint32 a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + } +} +#undef ATTENUATE + +// Divide source RGB by alpha and store to destination. +// b = (b * 255 + (a / 2)) / a; +// g = (g * 255 + (a / 2)) / a; +// r = (r * 255 + (a / 2)) / a; +// Reciprocal method is off by 1 on some values. ie 125 +// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. 
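+// Each entry packs 1.0 (0x0100) in the upper 16 bits and 65536 / a in the
+// lower 16 bits, so (x * (fixed_invtbl8[a] & 0xffff)) >> 8 ~= x * 256 / a,
+// approximating the exact x * 255 / a above.  For example, a = 128, x = 100:
+// ia = 512 and (100 * 512) >> 8 = 200 versus the exact 199.  The entries for
+// a = 0, a = 1 and a = 255 are special-cased constants.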
+#define T(a) 0x01000000 + (0x10000 / a) +const uint32 fixed_invtbl8[256] = { + 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), + T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), + T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), + T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), + T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), + T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), + T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), + T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), + T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), + T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), + T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), + T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), + T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), + T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), + T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), + T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), + T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), + T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), + T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), + T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), + T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), + T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), + T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), + T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), + T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), + T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), + T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), + T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 }; +#undef T + +void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + uint32 b = src_argb[0]; + uint32 g = src_argb[1]; + uint32 r = src_argb[2]; + const uint32 a = src_argb[3]; + const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point + b = (b * ia) >> 8; + g = (g * ia) >> 8; + r = (r * ia) >> 8; + // Clamping should not be necessary but is free in assembly. 
+ dst_argb[0] = clamp255(b); + dst_argb[1] = clamp255(g); + dst_argb[2] = clamp255(r); + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) { + int32 row_sum[4] = {0, 0, 0, 0}; + int x; + for (x = 0; x < width; ++x) { + row_sum[0] += row[x * 4 + 0]; + row_sum[1] += row[x * 4 + 1]; + row_sum[2] += row[x * 4 + 2]; + row_sum[3] += row[x * 4 + 3]; + cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; + cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; + cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; + cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; + } +} + +void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl, + int w, int area, uint8* dst, int count) { + float ooa = 1.0f / area; + int i; + for (i = 0; i < count; ++i) { + dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); + dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); + dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); + dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst += 4; + tl += 4; + bl += 4; + } +} + +// Copy pixels from rotated source to destination row with a slope. +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width) { + int i; + // Render a row of pixels from source into a buffer. + float uv[2]; + uv[0] = uv_dudv[0]; + uv[1] = uv_dudv[1]; + for (i = 0; i < width; ++i) { + int x = (int)(uv[0]); + int y = (int)(uv[1]); + *(uint32*)(dst_argb) = + *(const uint32*)(src_argb + y * src_argb_stride + + x * 4); + dst_argb += 4; + uv[0] += uv_dudv[2]; + uv[1] += uv_dudv[3]; + } +} + +// Blend 2 rows into 1 for conversions such as I422ToI420. +void HalfRow_C(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + int x; + for (x = 0; x < pix; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +void HalfRow_16_C(const uint16* src_uv, int src_uv_stride, + uint16* dst_uv, int pix) { + int x; + for (x = 0; x < pix; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +// C version 2x2 -> 2x1. 
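+// Vertically blend two rows with weights (256 - source_y_fraction) and
+// source_y_fraction, in 1/256 units.  A fraction of 0 is a plain copy of the
+// first row and 128 is the same average as HalfRow_C above.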
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, + int width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + int x; + if (source_y_fraction == 0) { + memcpy(dst_ptr, src_ptr, width); + return; + } + if (source_y_fraction == 128) { + HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width); + return; + } + for (x = 0; x < width - 1; x += 2) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; + } + if (width & 1) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + } +} + +void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride, + int width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16* src_ptr1 = src_ptr + src_stride; + int x; + if (source_y_fraction == 0) { + memcpy(dst_ptr, src_ptr, width * 2); + return; + } + if (source_y_fraction == 128) { + HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width); + return; + } + for (x = 0; x < width - 1; x += 2) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; + } + if (width & 1) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + } +} + +// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG +void ARGBToBayerRow_C(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix) { + int index0 = selector & 0xff; + int index1 = (selector >> 8) & 0xff; + // Copy a row of Bayer. + int x; + for (x = 0; x < pix - 1; x += 2) { + dst_bayer[0] = src_argb[index0]; + dst_bayer[1] = src_argb[index1]; + src_argb += 8; + dst_bayer += 2; + } + if (pix & 1) { + dst_bayer[0] = src_argb[index0]; + } +} + +// Select G channel from ARGB. e.g. GGGGGGGG +void ARGBToBayerGGRow_C(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix) { + // Copy a row of G. + int x; + for (x = 0; x < pix - 1; x += 2) { + dst_bayer[0] = src_argb[1]; + dst_bayer[1] = src_argb[5]; + src_argb += 8; + dst_bayer += 2; + } + if (pix & 1) { + dst_bayer[0] = src_argb[1]; + } +} + +// Use first 4 shuffler values to reorder ARGB channels. +void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + int index0 = shuffler[0]; + int index1 = shuffler[1]; + int index2 = shuffler[2]; + int index3 = shuffler[3]; + // Shuffle a row of ARGB. + int x; + for (x = 0; x < pix; ++x) { + // To support in-place conversion. 
+ uint8 b = src_argb[index0]; + uint8 g = src_argb[index1]; + uint8 r = src_argb[index2]; + uint8 a = src_argb[index3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +void I422ToYUY2Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = src_y[1]; + dst_frame[3] = src_v[0]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = src_y[0]; // duplicate last y + dst_frame[3] = src_v[0]; + } +} + +void I422ToUYVYRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = src_y[1]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = src_y[0]; // duplicate last y + } +} + +#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3) +// row_win.cc has asm version, but GCC uses 2 step wrapper. +#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__)) +void I422ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + // Allocate a row of ARGB. + align_buffer_64(row, width * 4); + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); + ARGBToRGB565Row_SSE2(row, rgb_buf, width); + free_aligned_buffer_64(row); +} +#endif // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__)) + +#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) +void I422ToARGB1555Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + // Allocate a row of ARGB. + align_buffer_64(row, width * 4); + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); + ARGBToARGB1555Row_SSE2(row, rgb_buf, width); + free_aligned_buffer_64(row); +} + +void I422ToARGB4444Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + // Allocate a row of ARGB. + align_buffer_64(row, width * 4); + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); + ARGBToARGB4444Row_SSE2(row, rgb_buf, width); + free_aligned_buffer_64(row); +} + +void NV12ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + // Allocate a row of ARGB. + align_buffer_64(row, width * 4); + NV12ToARGBRow_SSSE3(src_y, src_uv, row, width); + ARGBToRGB565Row_SSE2(row, dst_rgb565, width); + free_aligned_buffer_64(row); +} + +void NV21ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_rgb565, + int width) { + // Allocate a row of ARGB. + align_buffer_64(row, width * 4); + NV21ToARGBRow_SSSE3(src_y, src_vu, row, width); + ARGBToRGB565Row_SSE2(row, dst_rgb565, width); + free_aligned_buffer_64(row); +} + +void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + // Allocate a rows of yuv. 
+ align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width); + YUY2ToYRow_SSE2(src_yuy2, row_y, width); + I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); + free_aligned_buffer_64(row_y); +} + +void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width); + YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width); + I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width); + free_aligned_buffer_64(row_y); +} + +void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width); + UYVYToYRow_SSE2(src_uyvy, row_y, width); + I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); + free_aligned_buffer_64(row_y); +} + +void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width); + UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width); + I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width); + free_aligned_buffer_64(row_y); +} + +#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) +#endif // !defined(LIBYUV_DISABLE_X86) + +void ARGBPolynomialRow_C(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + int i; + for (i = 0; i < width; ++i) { + float b = (float)(src_argb[0]); + float g = (float)(src_argb[1]); + float r = (float)(src_argb[2]); + float a = (float)(src_argb[3]); + float b2 = b * b; + float g2 = g * g; + float r2 = r * r; + float a2 = a * a; + float db = poly[0] + poly[4] * b; + float dg = poly[1] + poly[5] * g; + float dr = poly[2] + poly[6] * r; + float da = poly[3] + poly[7] * a; + float b3 = b2 * b; + float g3 = g2 * g; + float r3 = r2 * r; + float a3 = a2 * a; + db += poly[8] * b2; + dg += poly[9] * g2; + dr += poly[10] * r2; + da += poly[11] * a2; + db += poly[12] * b3; + dg += poly[13] * g3; + dr += poly[14] * r3; + da += poly[15] * a3; + + dst_argb[0] = Clamp((int32)(db)); + dst_argb[1] = Clamp((int32)(dg)); + dst_argb[2] = Clamp((int32)(dr)); + dst_argb[3] = Clamp((int32)(da)); + src_argb += 4; + dst_argb += 4; + } +} + +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, uint32 lumacoeff) { + uint32 bc = lumacoeff & 0xff; + uint32 gc = (lumacoeff >> 8) & 0xff; + uint32 rc = (lumacoeff >> 16) & 0xff; + + int i; + for (i = 0; i < width - 1; i += 2) { + // Luminance in rows, color values in columns. 
+ const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + + src_argb[2] * rc) & 0x7F00u) + luma; + const uint8* luma1; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + luma1 = ((src_argb[4] * bc + src_argb[5] * gc + + src_argb[6] * rc) & 0x7F00u) + luma; + dst_argb[4] = luma1[src_argb[4]]; + dst_argb[5] = luma1[src_argb[5]]; + dst_argb[6] = luma1[src_argb[6]]; + dst_argb[7] = src_argb[7]; + src_argb += 8; + dst_argb += 8; + } + if (width & 1) { + // Luminance in rows, color values in columns. + const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + + src_argb[2] * rc) & 0x7F00u) + luma; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + } +} + +void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst[3] = src[3]; + dst[7] = src[7]; + dst += 8; + src += 8; + } + if (width & 1) { + dst[3] = src[3]; + } +} + +void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + dst[3] = src[0]; + dst[7] = src[1]; + dst += 8; + src += 2; + } + if (width & 1) { + dst[3] = src[0]; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/row_mips.cc b/TMessagesProj/jni/libyuv/source/row_mips.cc new file mode 100644 index 000000000..da7183bc1 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/row_mips.cc @@ -0,0 +1,994 @@ +/* + * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) + +#ifdef HAS_COPYROW_MIPS +void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { + __asm__ __volatile__ ( + ".set noreorder \n" + ".set noat \n" + "slti $at, %[count], 8 \n" + "bne $at ,$zero, $last8 \n" + "xor $t8, %[src], %[dst] \n" + "andi $t8, $t8, 0x3 \n" + + "bne $t8, $zero, unaligned \n" + "negu $a3, %[dst] \n" + // make dst/src aligned + "andi $a3, $a3, 0x3 \n" + "beq $a3, $zero, $chk16w \n" + // word-aligned now count is the remining bytes count + "subu %[count], %[count], $a3 \n" + + "lwr $t8, 0(%[src]) \n" + "addu %[src], %[src], $a3 \n" + "swr $t8, 0(%[dst]) \n" + "addu %[dst], %[dst], $a3 \n" + + // Now the dst/src are mutually word-aligned with word-aligned addresses + "$chk16w: \n" + "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? 
+ // t8 is the byte count after 64-byte chunks + "beq %[count], $t8, chk8w \n" + // There will be at most 1 32-byte chunk after it + "subu $a3, %[count], $t8 \n" // the reminder + // Here a3 counts bytes in 16w chunks + "addu $a3, %[dst], $a3 \n" + // Now a3 is the final dst after 64-byte chunks + "addu $t0, %[dst], %[count] \n" + // t0 is the "past the end" address + + // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past + // the "t0-32" address + // This means: for x=128 the last "safe" a1 address is "t0-160" + // Alternatively, for x=64 the last "safe" a1 address is "t0-96" + // we will use "pref 30,128(a1)", so "t0-160" is the limit + "subu $t9, $t0, 160 \n" + // t9 is the "last safe pref 30,128(a1)" address + "pref 0, 0(%[src]) \n" // first line of src + "pref 0, 32(%[src]) \n" // second line of src + "pref 0, 64(%[src]) \n" + "pref 30, 32(%[dst]) \n" + // In case the a1 > t9 don't use "pref 30" at all + "sgtu $v1, %[dst], $t9 \n" + "bgtz $v1, $loop16w \n" + "nop \n" + // otherwise, start with using pref30 + "pref 30, 64(%[dst]) \n" + "$loop16w: \n" + "pref 0, 96(%[src]) \n" + "lw $t0, 0(%[src]) \n" + "bgtz $v1, $skip_pref30_96 \n" // skip + "lw $t1, 4(%[src]) \n" + "pref 30, 96(%[dst]) \n" // continue + "$skip_pref30_96: \n" + "lw $t2, 8(%[src]) \n" + "lw $t3, 12(%[src]) \n" + "lw $t4, 16(%[src]) \n" + "lw $t5, 20(%[src]) \n" + "lw $t6, 24(%[src]) \n" + "lw $t7, 28(%[src]) \n" + "pref 0, 128(%[src]) \n" + // bring the next lines of src, addr 128 + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "lw $t0, 32(%[src]) \n" + "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1) + "lw $t1, 36(%[src]) \n" + "pref 30, 128(%[dst]) \n" // set dest, addr 128 + "$skip_pref30_128: \n" + "lw $t2, 40(%[src]) \n" + "lw $t3, 44(%[src]) \n" + "lw $t4, 48(%[src]) \n" + "lw $t5, 52(%[src]) \n" + "lw $t6, 56(%[src]) \n" + "lw $t7, 60(%[src]) \n" + "pref 0, 160(%[src]) \n" + // bring the next lines of src, addr 160 + "sw $t0, 32(%[dst]) \n" + "sw $t1, 36(%[dst]) \n" + "sw $t2, 40(%[dst]) \n" + "sw $t3, 44(%[dst]) \n" + "sw $t4, 48(%[dst]) \n" + "sw $t5, 52(%[dst]) \n" + "sw $t6, 56(%[dst]) \n" + "sw $t7, 60(%[dst]) \n" + + "addiu %[dst], %[dst], 64 \n" // adding 64 to dest + "sgtu $v1, %[dst], $t9 \n" + "bne %[dst], $a3, $loop16w \n" + " addiu %[src], %[src], 64 \n" // adding 64 to src + "move %[count], $t8 \n" + + // Here we have src and dest word-aligned but less than 64-bytes to go + + "chk8w: \n" + "pref 0, 0x0(%[src]) \n" + "andi $t8, %[count], 0x1f \n" // 32-byte chunk? 
+ // the t8 is the reminder count past 32-bytes + "beq %[count], $t8, chk1w \n" + // count=t8,no 32-byte chunk + " nop \n" + + "lw $t0, 0(%[src]) \n" + "lw $t1, 4(%[src]) \n" + "lw $t2, 8(%[src]) \n" + "lw $t3, 12(%[src]) \n" + "lw $t4, 16(%[src]) \n" + "lw $t5, 20(%[src]) \n" + "lw $t6, 24(%[src]) \n" + "lw $t7, 28(%[src]) \n" + "addiu %[src], %[src], 32 \n" + + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "addiu %[dst], %[dst], 32 \n" + + "chk1w: \n" + "andi %[count], $t8, 0x3 \n" + // now count is the reminder past 1w chunks + "beq %[count], $t8, $last8 \n" + " subu $a3, $t8, %[count] \n" + // a3 is count of bytes in 1w chunks + "addu $a3, %[dst], $a3 \n" + // now a3 is the dst address past the 1w chunks + // copying in words (4-byte chunks) + "$wordCopy_loop: \n" + "lw $t3, 0(%[src]) \n" + // the first t3 may be equal t0 ... optimize? + "addiu %[src], %[src],4 \n" + "addiu %[dst], %[dst],4 \n" + "bne %[dst], $a3,$wordCopy_loop \n" + " sw $t3, -4(%[dst]) \n" + + // For the last (<8) bytes + "$last8: \n" + "blez %[count], leave \n" + " addu $a3, %[dst], %[count] \n" // a3 -last dst address + "$last8loop: \n" + "lb $v1, 0(%[src]) \n" + "addiu %[src], %[src], 1 \n" + "addiu %[dst], %[dst], 1 \n" + "bne %[dst], $a3, $last8loop \n" + " sb $v1, -1(%[dst]) \n" + + "leave: \n" + " j $ra \n" + " nop \n" + + // + // UNALIGNED case + // + + "unaligned: \n" + // got here with a3="negu a1" + "andi $a3, $a3, 0x3 \n" // a1 is word aligned? + "beqz $a3, $ua_chk16w \n" + " subu %[count], %[count], $a3 \n" + // bytes left after initial a3 bytes + "lwr $v1, 0(%[src]) \n" + "lwl $v1, 3(%[src]) \n" + "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3 + "swr $v1, 0(%[dst]) \n" + "addu %[dst], %[dst], $a3 \n" + // below the dst will be word aligned (NOTE1) + "$ua_chk16w: \n" + "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? 
+ // t8 is the byte count after 64-byte chunks + "beq %[count], $t8, ua_chk8w \n" + // if a2==t8, no 64-byte chunks + // There will be at most 1 32-byte chunk after it + "subu $a3, %[count], $t8 \n" // the reminder + // Here a3 counts bytes in 16w chunks + "addu $a3, %[dst], $a3 \n" + // Now a3 is the final dst after 64-byte chunks + "addu $t0, %[dst], %[count] \n" // t0 "past the end" + "subu $t9, $t0, 160 \n" + // t9 is the "last safe pref 30,128(a1)" address + "pref 0, 0(%[src]) \n" // first line of src + "pref 0, 32(%[src]) \n" // second line addr 32 + "pref 0, 64(%[src]) \n" + "pref 30, 32(%[dst]) \n" + // safe, as we have at least 64 bytes ahead + // In case the a1 > t9 don't use "pref 30" at all + "sgtu $v1, %[dst], $t9 \n" + "bgtz $v1, $ua_loop16w \n" + // skip "pref 30,64(a1)" for too short arrays + " nop \n" + // otherwise, start with using pref30 + "pref 30, 64(%[dst]) \n" + "$ua_loop16w: \n" + "pref 0, 96(%[src]) \n" + "lwr $t0, 0(%[src]) \n" + "lwl $t0, 3(%[src]) \n" + "lwr $t1, 4(%[src]) \n" + "bgtz $v1, $ua_skip_pref30_96 \n" + " lwl $t1, 7(%[src]) \n" + "pref 30, 96(%[dst]) \n" + // continue setting up the dest, addr 96 + "$ua_skip_pref30_96: \n" + "lwr $t2, 8(%[src]) \n" + "lwl $t2, 11(%[src]) \n" + "lwr $t3, 12(%[src]) \n" + "lwl $t3, 15(%[src]) \n" + "lwr $t4, 16(%[src]) \n" + "lwl $t4, 19(%[src]) \n" + "lwr $t5, 20(%[src]) \n" + "lwl $t5, 23(%[src]) \n" + "lwr $t6, 24(%[src]) \n" + "lwl $t6, 27(%[src]) \n" + "lwr $t7, 28(%[src]) \n" + "lwl $t7, 31(%[src]) \n" + "pref 0, 128(%[src]) \n" + // bring the next lines of src, addr 128 + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "lwr $t0, 32(%[src]) \n" + "lwl $t0, 35(%[src]) \n" + "lwr $t1, 36(%[src]) \n" + "bgtz $v1, ua_skip_pref30_128 \n" + " lwl $t1, 39(%[src]) \n" + "pref 30, 128(%[dst]) \n" + // continue setting up the dest, addr 128 + "ua_skip_pref30_128: \n" + + "lwr $t2, 40(%[src]) \n" + "lwl $t2, 43(%[src]) \n" + "lwr $t3, 44(%[src]) \n" + "lwl $t3, 47(%[src]) \n" + "lwr $t4, 48(%[src]) \n" + "lwl $t4, 51(%[src]) \n" + "lwr $t5, 52(%[src]) \n" + "lwl $t5, 55(%[src]) \n" + "lwr $t6, 56(%[src]) \n" + "lwl $t6, 59(%[src]) \n" + "lwr $t7, 60(%[src]) \n" + "lwl $t7, 63(%[src]) \n" + "pref 0, 160(%[src]) \n" + // bring the next lines of src, addr 160 + "sw $t0, 32(%[dst]) \n" + "sw $t1, 36(%[dst]) \n" + "sw $t2, 40(%[dst]) \n" + "sw $t3, 44(%[dst]) \n" + "sw $t4, 48(%[dst]) \n" + "sw $t5, 52(%[dst]) \n" + "sw $t6, 56(%[dst]) \n" + "sw $t7, 60(%[dst]) \n" + + "addiu %[dst],%[dst],64 \n" // adding 64 to dest + "sgtu $v1,%[dst],$t9 \n" + "bne %[dst],$a3,$ua_loop16w \n" + " addiu %[src],%[src],64 \n" // adding 64 to src + "move %[count],$t8 \n" + + // Here we have src and dest word-aligned but less than 64-bytes to go + + "ua_chk8w: \n" + "pref 0, 0x0(%[src]) \n" + "andi $t8, %[count], 0x1f \n" // 32-byte chunk? 
+ // the t8 is the reminder count + "beq %[count], $t8, $ua_chk1w \n" + // when count==t8, no 32-byte chunk + + "lwr $t0, 0(%[src]) \n" + "lwl $t0, 3(%[src]) \n" + "lwr $t1, 4(%[src]) \n" + "lwl $t1, 7(%[src]) \n" + "lwr $t2, 8(%[src]) \n" + "lwl $t2, 11(%[src]) \n" + "lwr $t3, 12(%[src]) \n" + "lwl $t3, 15(%[src]) \n" + "lwr $t4, 16(%[src]) \n" + "lwl $t4, 19(%[src]) \n" + "lwr $t5, 20(%[src]) \n" + "lwl $t5, 23(%[src]) \n" + "lwr $t6, 24(%[src]) \n" + "lwl $t6, 27(%[src]) \n" + "lwr $t7, 28(%[src]) \n" + "lwl $t7, 31(%[src]) \n" + "addiu %[src], %[src], 32 \n" + + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "addiu %[dst], %[dst], 32 \n" + + "$ua_chk1w: \n" + "andi %[count], $t8, 0x3 \n" + // now count is the reminder past 1w chunks + "beq %[count], $t8, ua_smallCopy \n" + "subu $a3, $t8, %[count] \n" + // a3 is count of bytes in 1w chunks + "addu $a3, %[dst], $a3 \n" + // now a3 is the dst address past the 1w chunks + + // copying in words (4-byte chunks) + "$ua_wordCopy_loop: \n" + "lwr $v1, 0(%[src]) \n" + "lwl $v1, 3(%[src]) \n" + "addiu %[src], %[src], 4 \n" + "addiu %[dst], %[dst], 4 \n" + // note: dst=a1 is word aligned here, see NOTE1 + "bne %[dst], $a3, $ua_wordCopy_loop \n" + " sw $v1,-4(%[dst]) \n" + + // Now less than 4 bytes (value in count) left to copy + "ua_smallCopy: \n" + "beqz %[count], leave \n" + " addu $a3, %[dst], %[count] \n" // a3 = last dst address + "$ua_smallCopy_loop: \n" + "lb $v1, 0(%[src]) \n" + "addiu %[src], %[src], 1 \n" + "addiu %[dst], %[dst], 1 \n" + "bne %[dst],$a3,$ua_smallCopy_loop \n" + " sb $v1, -1(%[dst]) \n" + + "j $ra \n" + " nop \n" + ".set at \n" + ".set reorder \n" + : [dst] "+r" (dst), [src] "+r" (src) + : [count] "r" (count) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", + "t8", "t9", "a3", "v1", "at" + ); +} +#endif // HAS_COPYROW_MIPS + +// MIPS DSPR2 functions +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \ + (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) + +void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "srl $t4, %[width], 4 \n" // multiplies of 16 + "blez $t4, 2f \n" + " andi %[width], %[width], 0xf \n" // residual + + ".p2align 2 \n" + "1: \n" + "addiu $t4, $t4, -1 \n" + "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 + "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2 + "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4 + "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6 + "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8 + "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10 + "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12 + "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14 + "addiu %[src_uv], %[src_uv], 32 \n" + "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0 + "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0 + "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4 + "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4 + "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8 + "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8 + "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12 + "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12 + "sw $t9, 0(%[dst_v]) \n" + "sw $t0, 0(%[dst_u]) \n" + "sw $t1, 4(%[dst_v]) \n" + "sw $t2, 4(%[dst_u]) \n" + "sw $t3, 8(%[dst_v]) \n" + "sw $t5, 8(%[dst_u]) \n" + "sw 
$t6, 12(%[dst_v]) \n" + "sw $t7, 12(%[dst_u]) \n" + "addiu %[dst_v], %[dst_v], 16 \n" + "bgtz $t4, 1b \n" + " addiu %[dst_u], %[dst_u], 16 \n" + + "beqz %[width], 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, 0(%[src_uv]) \n" + "lbu $t1, 1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], 2 \n" + "addiu %[width], %[width], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "bgtz %[width], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r" (src_uv), + [width] "+r" (width), + [dst_u] "+r" (dst_u), + [dst_v] "+r" (dst_v) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6", "t7", "t8", "t9" + ); +} + +void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, + uint8* dst_v, int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "srl $t4, %[width], 4 \n" // multiplies of 16 + "blez $t4, 2f \n" + " andi %[width], %[width], 0xf \n" // residual + + ".p2align 2 \n" + "1: \n" + "addiu $t4, $t4, -1 \n" + "lwr $t0, 0(%[src_uv]) \n" + "lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0 + "lwr $t1, 4(%[src_uv]) \n" + "lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2 + "lwr $t2, 8(%[src_uv]) \n" + "lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4 + "lwr $t3, 12(%[src_uv]) \n" + "lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6 + "lwr $t5, 16(%[src_uv]) \n" + "lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8 + "lwr $t6, 20(%[src_uv]) \n" + "lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10 + "lwr $t7, 24(%[src_uv]) \n" + "lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12 + "lwr $t8, 28(%[src_uv]) \n" + "lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14 + "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0 + "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0 + "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4 + "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4 + "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8 + "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8 + "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12 + "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12 + "addiu %[src_uv], %[src_uv], 32 \n" + "swr $t9, 0(%[dst_v]) \n" + "swl $t9, 3(%[dst_v]) \n" + "swr $t0, 0(%[dst_u]) \n" + "swl $t0, 3(%[dst_u]) \n" + "swr $t1, 4(%[dst_v]) \n" + "swl $t1, 7(%[dst_v]) \n" + "swr $t2, 4(%[dst_u]) \n" + "swl $t2, 7(%[dst_u]) \n" + "swr $t3, 8(%[dst_v]) \n" + "swl $t3, 11(%[dst_v]) \n" + "swr $t5, 8(%[dst_u]) \n" + "swl $t5, 11(%[dst_u]) \n" + "swr $t6, 12(%[dst_v]) \n" + "swl $t6, 15(%[dst_v]) \n" + "swr $t7, 12(%[dst_u]) \n" + "swl $t7, 15(%[dst_u]) \n" + "addiu %[dst_u], %[dst_u], 16 \n" + "bgtz $t4, 1b \n" + " addiu %[dst_v], %[dst_v], 16 \n" + + "beqz %[width], 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, 0(%[src_uv]) \n" + "lbu $t1, 1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], 2 \n" + "addiu %[width], %[width], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "bgtz %[width], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r" (src_uv), + [width] "+r" (width), + [dst_u] "+r" (dst_u), + [dst_v] "+r" (dst_v) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6", "t7", "t8", "t9" + ); +} + +void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t4, %[width], 4 \n" // multiplies of 16 + "andi $t5, %[width], 0xf \n" + "blez $t4, 2f \n" + " addu %[src], %[src], %[width] \n" // src += width + + ".p2align 2 \n" 
+ "1: \n" + "lw $t0, -16(%[src]) \n" // |3|2|1|0| + "lw $t1, -12(%[src]) \n" // |7|6|5|4| + "lw $t2, -8(%[src]) \n" // |11|10|9|8| + "lw $t3, -4(%[src]) \n" // |15|14|13|12| + "wsbh $t0, $t0 \n" // |2|3|0|1| + "wsbh $t1, $t1 \n" // |6|7|4|5| + "wsbh $t2, $t2 \n" // |10|11|8|9| + "wsbh $t3, $t3 \n" // |14|15|12|13| + "rotr $t0, $t0, 16 \n" // |0|1|2|3| + "rotr $t1, $t1, 16 \n" // |4|5|6|7| + "rotr $t2, $t2, 16 \n" // |8|9|10|11| + "rotr $t3, $t3, 16 \n" // |12|13|14|15| + "addiu %[src], %[src], -16 \n" + "addiu $t4, $t4, -1 \n" + "sw $t3, 0(%[dst]) \n" // |15|14|13|12| + "sw $t2, 4(%[dst]) \n" // |11|10|9|8| + "sw $t1, 8(%[dst]) \n" // |7|6|5|4| + "sw $t0, 12(%[dst]) \n" // |3|2|1|0| + "bgtz $t4, 1b \n" + " addiu %[dst], %[dst], 16 \n" + "beqz $t5, 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, -1(%[src]) \n" + "addiu $t5, $t5, -1 \n" + "addiu %[src], %[src], -1 \n" + "sb $t0, 0(%[dst]) \n" + "bgez $t5, 2b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src] "+r" (src), [dst] "+r" (dst) + : [width] "r" (width) + : "t0", "t1", "t2", "t3", "t4", "t5" + ); +} + +void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + int x = 0; + int y = 0; + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "addu $t4, %[width], %[width] \n" + "srl %[x], %[width], 4 \n" + "andi %[y], %[width], 0xf \n" + "blez %[x], 2f \n" + " addu %[src_uv], %[src_uv], $t4 \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| + "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| + "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| + "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12| + "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16| + "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20| + "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24| + "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28| + + "rotr $t0, $t0, 16 \n" // |1|0|3|2| + "rotr $t1, $t1, 16 \n" // |5|4|7|6| + "rotr $t2, $t2, 16 \n" // |9|8|11|10| + "rotr $t3, $t3, 16 \n" // |13|12|15|14| + "rotr $t4, $t4, 16 \n" // |17|16|19|18| + "rotr $t6, $t6, 16 \n" // |21|20|23|22| + "rotr $t7, $t7, 16 \n" // |25|24|27|26| + "rotr $t8, $t8, 16 \n" // |29|28|31|30| + "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6| + "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7| + "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14| + "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15| + "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22| + "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23| + "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30| + "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31| + "addiu %[src_uv], %[src_uv], -32 \n" + "addiu %[x], %[x], -1 \n" + "swr $t4, 0(%[dst_u]) \n" + "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24| + "swr $t6, 0(%[dst_v]) \n" + "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25| + "swr $t2, 4(%[dst_u]) \n" + "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16| + "swr $t3, 4(%[dst_v]) \n" + "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17| + "swr $t0, 8(%[dst_u]) \n" + "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8| + "swr $t1, 8(%[dst_v]) \n" + "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9| + "swr $t9, 12(%[dst_u]) \n" + "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0| + "swr $t5, 12(%[dst_v]) \n" + "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1| + "addiu %[dst_v], %[dst_v], 16 \n" + "bgtz %[x], 1b \n" + " addiu %[dst_u], %[dst_u], 16 \n" + "beqz %[y], 3f \n" + " nop \n" + "b 2f \n" + " nop \n" + + "2: \n" + "lbu $t0, -2(%[src_uv]) \n" + "lbu $t1, -1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], -2 \n" + "addiu %[y], %[y], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu 
%[dst_u], %[dst_u], 1 \n" + "bgtz %[y], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r" (src_uv), + [dst_u] "+r" (dst_u), + [dst_v] "+r" (dst_v), + [x] "=&r" (x), + [y] "+r" (y) + : [width] "r" (width) + : "t0", "t1", "t2", "t3", "t4", + "t5", "t7", "t8", "t9" + ); +} + +// Convert (4 Y and 2 VU) I422 and arrange RGB values into +// t5 = | 0 | B0 | 0 | b0 | +// t4 = | 0 | B1 | 0 | b1 | +// t9 = | 0 | G0 | 0 | g0 | +// t8 = | 0 | G1 | 0 | g1 | +// t2 = | 0 | R0 | 0 | r0 | +// t1 = | 0 | R1 | 0 | r1 | +#define I422ToTransientMipsRGB \ + "lw $t0, 0(%[y_buf]) \n" \ + "lhu $t1, 0(%[u_buf]) \n" \ + "lhu $t2, 0(%[v_buf]) \n" \ + "preceu.ph.qbr $t1, $t1 \n" \ + "preceu.ph.qbr $t2, $t2 \n" \ + "preceu.ph.qbra $t3, $t0 \n" \ + "preceu.ph.qbla $t0, $t0 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t3, $t3, $s4 \n" \ + "subu.ph $t0, $t0, $s4 \n" \ + "mul.ph $t3, $t3, $s0 \n" \ + "mul.ph $t0, $t0, $s0 \n" \ + "shll.ph $t4, $t1, 0x7 \n" \ + "subu.ph $t4, $t4, $t1 \n" \ + "mul.ph $t6, $t1, $s1 \n" \ + "mul.ph $t1, $t2, $s2 \n" \ + "addq_s.ph $t5, $t4, $t3 \n" \ + "addq_s.ph $t4, $t4, $t0 \n" \ + "shra.ph $t5, $t5, 6 \n" \ + "shra.ph $t4, $t4, 6 \n" \ + "addiu %[u_buf], 2 \n" \ + "addiu %[v_buf], 2 \n" \ + "addu.ph $t6, $t6, $t1 \n" \ + "mul.ph $t1, $t2, $s3 \n" \ + "addu.ph $t9, $t6, $t3 \n" \ + "addu.ph $t8, $t6, $t0 \n" \ + "shra.ph $t9, $t9, 6 \n" \ + "shra.ph $t8, $t8, 6 \n" \ + "addu.ph $t2, $t1, $t3 \n" \ + "addu.ph $t1, $t1, $t0 \n" \ + "shra.ph $t2, $t2, 6 \n" \ + "shra.ph $t1, $t1, 6 \n" \ + "subu.ph $t5, $t5, $s5 \n" \ + "subu.ph $t4, $t4, $s5 \n" \ + "subu.ph $t9, $t9, $s5 \n" \ + "subu.ph $t8, $t8, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "shll_s.ph $t5, $t5, 8 \n" \ + "shll_s.ph $t4, $t4, 8 \n" \ + "shll_s.ph $t9, $t9, 8 \n" \ + "shll_s.ph $t8, $t8, 8 \n" \ + "shll_s.ph $t2, $t2, 8 \n" \ + "shll_s.ph $t1, $t1, 8 \n" \ + "shra.ph $t5, $t5, 8 \n" \ + "shra.ph $t4, $t4, 8 \n" \ + "shra.ph $t9, $t9, 8 \n" \ + "shra.ph $t8, $t8, 8 \n" \ + "shra.ph $t2, $t2, 8 \n" \ + "shra.ph $t1, $t1, 8 \n" \ + "addu.ph $t5, $t5, $s5 \n" \ + "addu.ph $t4, $t4, $s5 \n" \ + "addu.ph $t9, $t9, $s5 \n" \ + "addu.ph $t8, $t8, $s5 \n" \ + "addu.ph $t2, $t2, $s5 \n" \ + "addu.ph $t1, $t1, $s5 \n" + +void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| // clipping + "lui $s6, 0xff00 \n" + "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| + + ".p2align 2 \n" + "1: \n" + I422ToTransientMipsRGB +// Arranging into argb format + "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| + "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0| + "addiu %[width], -4 \n" + "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0| + "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0| + "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| + + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| + "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0| + "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0| + "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1| + "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1| + "sll $t9, 
$t9, 16 \n" + "sll $t8, $t8, 16 \n" + "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0| + "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0| +// Store results. + "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| + "lui $s6, 0xff00 \n" + "ori $s6, 0xff00 \n" // |ff|00|ff|00| + + ".p2align 2 \n" + "1: \n" + I422ToTransientMipsRGB +// Arranging into abgr format + "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1| + "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0| + "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0| + "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0| + + "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0| + "addiu %[width], -4 \n" + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0| + "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0| + "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0| + "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1| + "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1| + "sll $t9, $t9, 16 \n" + "sll $t8, $t8, 16 \n" + "packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0| + "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0| +// Store results. 
+ "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 | + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| + "lui $s6, 0xff \n" + "ori $s6, 0xff \n" // |00|ff|00|ff| + + ".p2align 2 \n" + "1: \n" + I422ToTransientMipsRGB + // Arranging into bgra format + "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1| + "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0| + "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0| + "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0| + + "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| + "addiu %[width], -4 \n" + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| + "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 | + "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 | + "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff| + "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff| + "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff| + "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff| + "sll $t1, $t1, 16 \n" + "sll $t2, $t2, 16 \n" + "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff| + "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff| +// Store results. 
+ "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +// Bilinear filter 8x2 -> 8x1 +void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + int y0_fraction = 256 - source_y_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "replv.ph $t0, %[y0_fraction] \n" + "replv.ph $t1, %[source_y_fraction] \n" + + ".p2align 2 \n" + "1: \n" + "lw $t2, 0(%[src_ptr]) \n" + "lw $t3, 0(%[src_ptr1]) \n" + "lw $t4, 4(%[src_ptr]) \n" + "lw $t5, 4(%[src_ptr1]) \n" + "muleu_s.ph.qbl $t6, $t2, $t0 \n" + "muleu_s.ph.qbr $t7, $t2, $t0 \n" + "muleu_s.ph.qbl $t8, $t3, $t1 \n" + "muleu_s.ph.qbr $t9, $t3, $t1 \n" + "muleu_s.ph.qbl $t2, $t4, $t0 \n" + "muleu_s.ph.qbr $t3, $t4, $t0 \n" + "muleu_s.ph.qbl $t4, $t5, $t1 \n" + "muleu_s.ph.qbr $t5, $t5, $t1 \n" + "addq.ph $t6, $t6, $t8 \n" + "addq.ph $t7, $t7, $t9 \n" + "addq.ph $t2, $t2, $t4 \n" + "addq.ph $t3, $t3, $t5 \n" + "shra.ph $t6, $t6, 8 \n" + "shra.ph $t7, $t7, 8 \n" + "shra.ph $t2, $t2, 8 \n" + "shra.ph $t3, $t3, 8 \n" + "precr.qb.ph $t6, $t6, $t7 \n" + "precr.qb.ph $t2, $t2, $t3 \n" + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[src_ptr1], %[src_ptr1], 8 \n" + "addiu %[dst_width], %[dst_width], -8 \n" + "sw $t6, 0(%[dst_ptr]) \n" + "sw $t2, 4(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[dst_ptr], %[dst_ptr], 8 \n" + + ".set pop \n" + : [dst_ptr] "+r" (dst_ptr), + [src_ptr1] "+r" (src_ptr1), + [src_ptr] "+r" (src_ptr), + [dst_width] "+r" (dst_width) + : [source_y_fraction] "r" (source_y_fraction), + [y0_fraction] "r" (y0_fraction), + [src_stride] "r" (src_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} +#endif // __mips_dsp_rev >= 2 + +#endif // defined(__mips__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/row_neon.cc b/TMessagesProj/jni/libyuv/source/row_neon.cc new file mode 100644 index 000000000..1392cf5fc --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/row_neon.cc @@ -0,0 +1,3148 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.32 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.32 {d2[1]}, [%2]! \n" + +// Read 8 Y, 2 U and 2 V from 422 +#define READYUV411 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.16 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.16 {d2[1]}, [%2]! 
\n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d2, d3 \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 YUY2 +#define READYUY2 \ + MEMACCESS(0) \ + "vld2.8 {d0, d2}, [%0]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 UYVY +#define READUYVY \ + MEMACCESS(0) \ + "vld2.8 {d2, d3}, [%0]! \n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +#define YUV422TORGB \ + "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\ + "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\ + "vmull.s8 q9, d2, d25 \n"/* u/v G component */\ + "vmov.u8 d1, #0 \n"/* split odd/even y apart */\ + "vtrn.u8 d0, d1 \n" \ + "vsub.s16 q0, q0, q15 \n"/* offset y */\ + "vmul.s16 q0, q0, q14 \n" \ + "vadd.s16 d18, d19 \n" \ + "vqadd.s16 d20, d0, d16 \n" /* B */ \ + "vqadd.s16 d21, d1, d16 \n" \ + "vqadd.s16 d22, d0, d17 \n" /* R */ \ + "vqadd.s16 d23, d1, d17 \n" \ + "vqadd.s16 d16, d0, d18 \n" /* G */ \ + "vqadd.s16 d17, d1, d18 \n" \ + "vqshrun.s16 d0, q10, #6 \n" /* B */ \ + "vqshrun.s16 d1, q11, #6 \n" /* G */ \ + "vqshrun.s16 d2, q8, #6 \n" /* R */ \ + "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\ + "vmovl.u8 q11, d1 \n" \ + "vmovl.u8 q8, d2 \n" \ + "vtrn.u8 d20, d21 \n" \ + "vtrn.u8 d22, d23 \n" \ + "vtrn.u8 d16, d17 \n" \ + "vmov.u8 d21, d16 \n" + +static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, + 0, 0, 0, 0, 0, 0, 0, 0 }; + +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV444 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV411 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d19, #255 \n" + MEMACCESS(3) + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_bgra), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_abgr), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" + MEMACCESS(3) + "vst4.8 {d19, d20, d21, d22}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + MEMACCESS(3) + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + MEMACCESS(3) + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_raw), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#define ARGBTORGB565 \ + "vshr.u8 d20, d20, #3 \n" /* B */ \ + "vshr.u8 d21, d21, #2 \n" /* G */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #11 \n" /* R */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q0, q0, q10 \n" /* BGR */ + +void I422ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + ARGBTORGB565 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#define ARGBTOARGB1555 \ + "vshr.u8 q10, q10, #3 \n" /* B */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vshr.u8 d23, d23, #7 \n" /* A */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vmovl.u8 q11, d23 \n" /* A */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #10 \n" /* R */ \ + "vshl.u16 q11, q11, #15 \n" /* A */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q1, q10, q11 \n" /* RA */ \ + "vorr q0, q0, q1 \n" /* BGRA */ + +void I422ToARGB1555Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB1555 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#define ARGBTOARGB4444 \ + "vshr.u8 d20, d20, #4 \n" /* B */ \ + "vbic.32 d21, d21, d4 \n" /* G */ \ + "vshr.u8 d22, d22, #4 \n" /* R */ \ + "vbic.32 d23, d23, d4 \n" /* A */ \ + "vorr d0, d20, d21 \n" /* BG */ \ + "vorr d1, d22, d23 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ + +void I422ToARGB4444Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB4444 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void YToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV400 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + ".p2align 2 \n" + "vmov.u8 d23, #255 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d20}, [%0]! 
\n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23" + ); +} + +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(2) + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(2) + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUY2 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READUYVY + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store U + MEMACCESS(2) + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 U's and V's and writes out 16 pairs of UV. +void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load U + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(2) + "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" + : + "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. +void CopyRow_NEON(const uint8* src, uint8* dst, int count) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + MEMACCESS(1) + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SetRow8 writes 'count' bytes using a 32 bit value repeated. +void SetRow_NEON(uint8* dst, uint32 v32, int count) { + asm volatile ( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v32) // %2 + : "cc", "memory", "q0" + ); +} + +// TODO(fbarchard): Make fully assembler +// SetRow32 writes 'count' words using a 32 bit value repeated. +void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + for (int y = 0; y < height; ++y) { + SetRow_NEON(dst, v32, width << 2); + dst += dst_stride; + } +} + +void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. 
+ "mov r3, #-16 \n" + "add %0, %0, %2 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #16 \n" // 16 pixels per loop. + "vrev64.8 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0" + ); +} + +void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // dst += 8 + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "r12", "q0" + ); +} + +void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #4 \n" // 4 pixels per loop. + "vrev64.32 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0" + ); +} + +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ \ + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ + +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // store 8 U. + MEMACCESS(2) + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 U. + MEMACCESS(2) + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // stride + src_yuy2 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 U. + MEMACCESS(3) + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // stride + src_uyvy + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. 
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 U. + MEMACCESS(3) + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} + +void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %0 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. + "vrhadd.u8 q0, q1 \n" // average row 1 and 2 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(src_uv_stride), // %1 + "+r"(dst_uv), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG +void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile ( + "vmov.u32 d6[0], %3 \n" // selector + "1: \n" + MEMACCESS(0) + "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels + "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels + "vtrn.u32 d4, d5 \n" // combine 8 pixels + MEMACCESS(1) + "vst1.8 {d4}, [%1]! \n" // store 8. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : "r"(selector) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// Select G channels from ARGB. e.g. GGGGGGGG +void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /*selector*/, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // store 8 G's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {q2}, [%3] \n" // shuffler + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + MEMACCESS(1) + "vld1.8 {d1}, [%1]! \n" // load 8 Us + MEMACCESS(2) + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} + +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + MEMACCESS(1) + "vld1.8 {d0}, [%1]! \n" // load 8 Us + MEMACCESS(2) + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} + +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, + int pix) { + asm volatile ( + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} + +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} + +// 8x1 pixels. +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" + ); +} + +// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + + "subs %3, %3, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q0, q10 \n" // B + "vmls.s16 q8, q1, q11 \n" // G + "vmls.s16 q8, q2, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + + "vmul.s16 q9, q2, q10 \n" // R + "vmls.s16 q9, q1, q14 \n" // G + "vmls.s16 q9, q0, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. 
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(0) + "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. + MEMACCESS(0) + "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. + "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. + + "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. + "vpadd.u16 d1, d8, d9 \n" // B + "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. + "vpadd.u16 d3, d10, d11 \n" // G + "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. + "vpadd.u16 d5, d12, d13 \n" // R + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %3, %3, #32 \n" // 32 processed per loop. + "vmul.s16 q8, q0, q10 \n" // B + "vmls.s16 q8, q1, q11 \n" // G + "vmls.s16 q8, q2, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q2, q10 \n" // R + "vmls.s16 q9, q1, q14 \n" // G + "vmls.s16 q9, q0, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! 
\n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): Subsample match C code. +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_bgra + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. 
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. + "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q1, q1, #1 \n" // 2x average + "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q3, q3, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q3, q2, q1) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_stride_bgra), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_abgr + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. 
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_stride_rgba), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + MEMACCESS(0) + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + MEMACCESS(1) + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_stride_rgb24), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + MEMACCESS(0) + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. 
+ MEMACCESS(1) + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_stride_raw), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. 
+ ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} + +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} + +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} + +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! 
\n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" + ); +} + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "subs %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" + + "89: \n" + "adds %3, #8-1 \n" + "blt 99f \n" + + // Blend 1 pixels. + "1: \n" + MEMACCESS(0) + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + MEMACCESS(1) + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" + ); +} + +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12" + ); +} + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vqdmulh.s16 q0, q0, q8 \n" // b * scale + "vqdmulh.s16 q1, q1, q8 \n" // g + "vqdmulh.s16 q2, q2, q8 \n" // r + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + MEMACCESS(0) + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" + ); +} + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" + "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 + "vqrdmulh.s16 q11, q11, d0[1] \n" // g + "vqrdmulh.s16 q12, q12, d0[2] \n" // r + "vqrdmulh.s16 q13, q13, d0[3] \n" // a + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + MEMACCESS(1) + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "q0", "q10", "q11", "q12", "q13" + ); +} + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. +// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; +void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B + "vmov d1, d0 \n" // G + "vmov d2, d0 \n" // R + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d20, #17 \n" // BB coefficient + "vmov.u8 d21, #68 \n" // BG coefficient + "vmov.u8 d22, #35 \n" // BR coefficient + "vmov.u8 d24, #22 \n" // GB coefficient + "vmov.u8 d25, #88 \n" // GG coefficient + "vmov.u8 d26, #45 \n" // GR coefficient + "vmov.u8 d28, #24 \n" // BB coefficient + "vmov.u8 d29, #98 \n" // BG coefficient + "vmov.u8 d30, #50 \n" // BR coefficient + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. + "subs %1, %1, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B + "vmlal.u8 q2, d1, d21 \n" // G + "vmlal.u8 q2, d2, d22 \n" // R + "vmull.u8 q3, d0, d24 \n" // B to Sepia G + "vmlal.u8 q3, d1, d25 \n" // G + "vmlal.u8 q3, d2, d26 \n" // R + "vmull.u8 q8, d0, d28 \n" // B to Sepia R + "vmlal.u8 q8, d1, d29 \n" // G + "vmlal.u8 q8, d2, d30 \n" // R + "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B + "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G + "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + MEMACCESS(0) + "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "q0", "q1", "q2", "q3", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// TODO(fbarchard): Was same as Sepia except matrix is provided. This function +// needs to saturate. Consider doing a non-saturating version. +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. + "vmovl.s8 q0, d4 \n" // B,G coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q15, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + MEMACCESS(1) + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. 
+#ifdef HAS_ARGBMULTIPLYROW_NEON +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} +#endif // HAS_ARGBMULTIPLYROW_NEON + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// Adds Sobel X and Sobel Y and stores Sobel into plane. 
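+// Per pixel this is a saturating add, dst_y[i] = min(sobelx[i] + sobely[i], 255),
+// which the vqadd.u8 below applies to 16 pixels per iteration. A hypothetical
+// scalar sketch (illustration only, not the library's C fallback path):
+//   for (int i = 0; i < width; ++i) {
+//     int s = src_sobelx[i] + src_sobely[i];
+//     dst_y[i] = (uint8)(s > 255 ? 255 : s);
+//   }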
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + // 16 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0],%5 \n" // top + MEMACCESS(0) + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(1) + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + MEMACCESS(1) + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + MEMACCESS(2) + "vld1.8 {d2}, [%2],%5 \n" // bottom + MEMACCESS(2) + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + MEMACCESS(3) + "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0],%4 \n" // left + MEMACCESS(1) + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + MEMACCESS(1) + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0],%5 \n" // right + MEMACCESS(1) + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + MEMACCESS(2) + "vst1.8 {d0}, [%2]! 
\n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/row_neon64.cc b/TMessagesProj/jni/libyuv/source/row_neon64.cc new file mode 100644 index 000000000..952e10d73 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/row_neon64.cc @@ -0,0 +1,3327 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.32 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.32 {d2[1]}, [%2]! \n" + +// Read 8 Y, 2 U and 2 V from 422 +#define READYUV411 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.16 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.16 {d2[1]}, [%2]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d2, d3 \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 YUY2 +#define READYUY2 \ + MEMACCESS(0) \ + "vld2.8 {d0, d2}, [%0]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 UYVY +#define READUYVY \ + MEMACCESS(0) \ + "vld2.8 {d2, d3}, [%0]! 
\n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +#define YUV422TORGB \ + "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\ + "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\ + "vmull.s8 q9, d2, d25 \n"/* u/v G component */\ + "vmov.u8 d1, #0 \n"/* split odd/even y apart */\ + "vtrn.u8 d0, d1 \n" \ + "vsub.s16 q0, q0, q15 \n"/* offset y */\ + "vmul.s16 q0, q0, q14 \n" \ + "vadd.s16 d18, d19 \n" \ + "vqadd.s16 d20, d0, d16 \n" /* B */ \ + "vqadd.s16 d21, d1, d16 \n" \ + "vqadd.s16 d22, d0, d17 \n" /* R */ \ + "vqadd.s16 d23, d1, d17 \n" \ + "vqadd.s16 d16, d0, d18 \n" /* G */ \ + "vqadd.s16 d17, d1, d18 \n" \ + "vqshrun.s16 d0, q10, #6 \n" /* B */ \ + "vqshrun.s16 d1, q11, #6 \n" /* G */ \ + "vqshrun.s16 d2, q8, #6 \n" /* R */ \ + "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\ + "vmovl.u8 q11, d1 \n" \ + "vmovl.u8 q8, d2 \n" \ + "vtrn.u8 d20, d21 \n" \ + "vtrn.u8 d22, d23 \n" \ + "vtrn.u8 d16, d17 \n" \ + "vmov.u8 d21, d16 \n" + +static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, + 0, 0, 0, 0, 0, 0, 0, 0 }; + +#ifdef HAS_I444TOARGBROW_NEON +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV444 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I444TOARGBROW_NEON + +#ifdef HAS_I422TOARGBROW_NEON +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOARGBROW_NEON + +#ifdef HAS_I411TOARGBROW_NEON +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV411 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I411TOARGBROW_NEON + +#ifdef HAS_I422TOBGRAROW_NEON +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d19, #255 \n" + MEMACCESS(3) + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_bgra), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOBGRAROW_NEON + +#ifdef HAS_I422TOABGRROW_NEON +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_abgr), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOABGRROW_NEON + +#ifdef HAS_I422TORGBAROW_NEON +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" + MEMACCESS(3) + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORGBAROW_NEON + +#ifdef HAS_I422TORGB24ROW_NEON +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + MEMACCESS(3) + "vst3.8 {d20, d21, d22}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORGB24ROW_NEON + +#ifdef HAS_I422TORAWROW_NEON +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + MEMACCESS(3) + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_raw), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORAWROW_NEON + +#define ARGBTORGB565 \ + "vshr.u8 d20, d20, #3 \n" /* B */ \ + "vshr.u8 d21, d21, #2 \n" /* G */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #11 \n" /* R */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q0, q0, q10 \n" /* BGR */ + +#ifdef HAS_I422TORGB565ROW_NEON +void I422ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + ARGBTORGB565 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORGB565ROW_NEON + +#define ARGBTOARGB1555 \ + "vshr.u8 q10, q10, #3 \n" /* B */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vshr.u8 d23, d23, #7 \n" /* A */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vmovl.u8 q11, d23 \n" /* A */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #10 \n" /* R */ \ + "vshl.u16 q11, q11, #15 \n" /* A */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q1, q10, q11 \n" /* RA */ \ + "vorr q0, q0, q1 \n" /* BGRA */ + +#ifdef HAS_I422TOARGB1555ROW_NEON +void I422ToARGB1555Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB1555 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOARGB1555ROW_NEON + +#define ARGBTOARGB4444 \ + "vshr.u8 d20, d20, #4 \n" /* B */ \ + "vbic.32 d21, d21, d4 \n" /* G */ \ + "vshr.u8 d22, d22, #4 \n" /* R */ \ + "vbic.32 d23, d23, d4 \n" /* A */ \ + "vorr d0, d20, d21 \n" /* BG */ \ + "vorr d1, d22, d23 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ + +#ifdef HAS_I422TOARGB4444ROW_NEON +void I422ToARGB4444Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB4444 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOARGB4444ROW_NEON + +#ifdef HAS_YTOARGBROW_NEON +void YToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV400 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_YTOARGBROW_NEON + +#ifdef HAS_I400TOARGBROW_NEON +void I400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + ".p2align 2 \n" + "vmov.u8 d23, #255 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23" + ); +} +#endif // HAS_I400TOARGBROW_NEON + +#ifdef HAS_NV12TOARGBROW_NEON +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(2) + "vst4.8 {d20, d21, d22, d23}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV12TOARGBROW_NEON + +#ifdef HAS_NV21TOARGBROW_NEON +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(2) + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV21TOARGBROW_NEON + +#ifdef HAS_NV12TORGB565ROW_NEON +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV12TORGB565ROW_NEON + +#ifdef HAS_NV21TORGB565ROW_NEON +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV21TORGB565ROW_NEON + +#ifdef HAS_YUY2TOARGBROW_NEON +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUY2 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_YUY2TOARGBROW_NEON + +#ifdef HAS_UYVYTOARGBROW_NEON +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READUYVY + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_UYVYTOARGBROW_NEON + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +#ifdef HAS_SPLITUVROW_NEON +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store U + MEMACCESS(2) + "st1 {v1.16b}, [%2], #16 \n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_SPLITUVROW_NEON + +// Reads 16 U's and V's and writes out 16 pairs of UV. +#ifdef HAS_MERGEUVROW_NEON +void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load U + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(2) + "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "bgt 1b \n" + : + "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_MERGEUVROW_NEON + +// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. +#ifdef HAS_COPYROW_NEON +void CopyRow_NEON(const uint8* src, uint8* dst, int count) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + MEMACCESS(1) + "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_COPYROW_NEON + +// SetRow8 writes 'count' bytes using a 32 bit value repeated. +#ifdef HAS_SETROW_NEON +void SetRow_NEON(uint8* dst, uint32 v32, int count) { + asm volatile ( + "dup v0.4s, %w2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v32) // %2 + : "cc", "memory", "v0" + ); +} +#endif // HAS_SETROW_NEON + +// TODO(fbarchard): Make fully assembler +// SetRow32 writes 'count' words using a 32 bit value repeated. 
+#ifdef HAS_ARGBSETROWS_NEON +void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + for (int y = 0; y < height; ++y) { + SetRow_NEON(dst, v32, width << 2); + dst += dst_stride; + } +} +#endif // HAS_ARGBSETROWS_NEON + +#ifdef HAS_MIRRORROW_NEON +void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "add %0, %0, %2 \n" + "sub %0, %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %2, %2, #16 \n" // 16 pixels per loop. + "rev64 v0.16b, v0.16b \n" + MEMACCESS(1) + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + MEMACCESS(1) + "st1 {v0.D}[0], [%1], #8 \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0" + ); +} +#endif // HAS_MIRRORROW_NEON + +#ifdef HAS_MIRRORUVROW_NEON +void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + // Start at end of source row. + "add %0, %0, %3, lsl #1 \n" + "sub %0, %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 + "subs %3, %3, #8 \n" // 8 pixels per loop. + "rev64 v0.8b, v0.8b \n" + "rev64 v1.8b, v1.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // dst += 8 + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((ptrdiff_t)-16) // %4 + : "cc", "memory", "v0", "v1" + ); +} +#endif // HAS_MIRRORUVROW_NEON + +#ifdef HAS_ARGBMIRRORROW_NEON +void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "add %0, %0, %2, lsl #2 \n" + "sub %0, %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %2, %2, #4 \n" // 4 pixels per loop. + "rev64 v0.4s, v0.4s \n" + MEMACCESS(1) + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + MEMACCESS(1) + "st1 {v0.D}[0], [%1], #8 \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0" + ); +} +#endif // HAS_ARGBMIRRORROW_NEON + +#ifdef HAS_RGB24TOARGBROW_NEON +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "movi v4.8b, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} +#endif // HAS_RGB24TOARGBROW_NEON + +#ifdef HAS_RAWTOARGBROW_NEON +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "movi v5.8b, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "mov v3.8b, v1.8b \n" // move g + "mov v4.8b, v0.8b \n" // move r + MEMACCESS(1) + "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} +#endif // HAS_RAWTOARGBROW_NEON + +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +#ifdef HAS_RGB565TOARGBROW_NEON +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif // HAS_RGB565TOARGBROW_NEON + +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ \ + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +#ifdef HAS_ARGB1555TOARGBROW_NEON +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif // HAS_ARGB1555TOARGBROW_NEON + +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ + +#ifdef HAS_ARGB4444TOARGBROW_NEON +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} +#endif // HAS_ARGB4444TOARGBROW_NEON + +#ifdef HAS_ARGBTORGB24ROW_NEON +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} +#endif // HAS_ARGBTORGB24ROW_NEON + +#ifdef HAS_ARGBTORAWROW_NEON +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a + "subs %2, %2, #8 \n" // 8 processed per loop. + "mov v4.8b, v2.8b \n" // mov g + "mov v5.8b, v1.8b \n" // mov b + MEMACCESS(1) + "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} +#endif // HAS_ARGBTORAWROW_NEON + +#ifdef HAS_YUY2TOYROW_NEON +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_YUY2TOYROW_NEON + +#ifdef HAS_UYVYTOYROW_NEON +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 
+ "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_UYVYTOYROW_NEON + +#ifdef HAS_YUY2TOUV422ROW_NEON +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + MEMACCESS(2) + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_YUY2TOUV422ROW_NEON + +#ifdef HAS_UYVYTOUV422ROW_NEON +void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + MEMACCESS(2) + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_UYVYTOUV422ROW_NEON + +#ifdef HAS_YUY2TOUVROW_NEON +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2. + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + MEMACCESS(3) + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List + ); +} +#endif // HAS_YUY2TOUVROW_NEON + +#ifdef HAS_UYVYTOUVROW_NEON +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY. + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + MEMACCESS(3) + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List + ); +} +#endif // HAS_UYVYTOUVROW_NEON + +#ifdef HAS_HALFROW_NEON +void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + asm volatile ( + // change the stride to row 2 pointer + "add %x1, %x0, %w1, sxtw \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels. 
+ "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels. + "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2 + MEMACCESS(2) + "st1 {v0.16b}, [%2], #16 \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(src_uv_stride), // %1 + "+r"(dst_uv), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_HALFROW_NEON + +// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG +#ifdef HAS_ARGBTOBAYERROW_NEON +void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile ( + "mov v2.s[0], %w3 \n" // selector + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels + "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels + "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels + MEMACCESS(1) + "st1 {v4.8b}, [%1], #8 \n" // store 8. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : "r"(selector) // %3 + : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List + ); +} +#endif // HAS_ARGBTOBAYERROW_NEON + +// Select G channels from ARGB. e.g. GGGGGGGG +#ifdef HAS_ARGBTOBAYERGGROW_NEON +void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /*selector*/, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "st1 {v1.8b}, [%1], #8 \n" // store 8 G's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_ARGBTOBAYERGGROW_NEON + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +#ifdef HAS_ARGBSHUFFLEROW_NEON +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + MEMACCESS(3) + "ld1 {v2.16b}, [%3] \n" // shuffler + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} +#endif // HAS_ARGBSHUFFLEROW_NEON + +#ifdef HAS_I422TOYUY2ROW_NEON +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "mov v2.8b, v1.8b \n" + MEMACCESS(1) + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + MEMACCESS(2) + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_I422TOYUY2ROW_NEON + +#ifdef HAS_I422TOUYVYROW_NEON +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys + "mov v3.8b, v2.8b \n" + MEMACCESS(1) + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + MEMACCESS(2) + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_I422TOUYVYROW_NEON + +#ifdef HAS_ARGBTORGB565ROW_NEON +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} +#endif // HAS_ARGBTORGB565ROW_NEON + +#ifdef HAS_ARGBTOARGB1555ROW_NEON +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} +#endif // HAS_ARGBTOARGB1555ROW_NEON + +#ifdef HAS_ARGBTOARGB4444ROW_NEON +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, + int pix) { + asm volatile ( + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} +#endif // HAS_ARGBTOARGB4444ROW_NEON + +#ifdef HAS_ARGBTOYROW_NEON +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBTOYROW_NEON + +#ifdef HAS_ARGBTOYJROW_NEON +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #15 \n" // B * 0.11400 coefficient + "movi v5.8b, #75 \n" // G * 0.58700 coefficient + "movi v6.8b, #38 \n" // R * 0.29900 coefficient + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" + ); +} +#endif // HAS_ARGBTOYJROW_NEON + +// 8x1 pixels. +#ifdef HAS_ARGBTOUV444ROW_NEON +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned + + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", + "v24", "v25", "v26", "v27", "v28", "v29" + ); +} +#endif // HAS_ARGBTOUV444ROW_NEON + +// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGBTOUV422ROW_NEON +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "movi v20.8h, #112 / 2 \n" // UB / VR 0.875 coefficient + "movi v21.8h, #74 / 2 \n" // UG -0.5781 coefficient + "movi v22.8h, #38 / 2 \n" // UR -0.2969 coefficient + "movi v23.8h, #18 / 2 \n" // VB -0.1406 coefficient + "movi v24.8h, #94 / 2 \n" // VG -0.7344 coefficient + "movi v25.16b, #0x80 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b-v3.16b}, [%0], #64 \n" // load 16 ARGB pixels. + + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + "subs %3, %3, #16 \n" // 16 processed per loop. 
+ "mul v3.8h, v0.8h, v20.8h \n" // B + "mls v3.8h, v1.8h, v21.8h \n" // G + "mls v3.8h, v2.8h, v22.8h \n" // R + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + + "mul v4.8h, v2.8h, v20.8h \n" // R + "mls v4.8h, v1.8h, v24.8h \n" // G + "mls v4.8h, v0.8h, v23.8h \n" // B + "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUV422ROW_NEON + +// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. +#ifdef HAS_ARGBTOUV411ROW_NEON +void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "movi v20.8h, #112 / 2 \n" // UB / VR 0.875 coefficient + "movi v21.8h, #74 / 2 \n" // UG -0.5781 coefficient + "movi v22.8h, #38 / 2 \n" // UR -0.2969 coefficient + "movi v23.8h, #18 / 2 \n" // VB -0.1406 coefficient + "movi v24.8h, #94 / 2 \n" // VG -0.7344 coefficient + "movi v25.16b, #0x80 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b-v3.16b}, [%0], #64 \n" // load 16 ARGB pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(0) + "ld4 {v4.16b-v7.16b}, [%0], #64 \n" // load next 16 ARGB pixels. + "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. + "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. + "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %3, %3, #32 \n" // 32 processed per loop. + "mul v3.8h, v0.8h, v20.8h \n" // B + "mls v3.8h, v1.8h, v21.8h \n" // G + "mls v3.8h, v2.8h, v22.8h \n" // R + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "mul v4.8h, v2.8h, v20.8h \n" // R + "mls v4.8h, v1.8h, v24.8h \n" // G + "mls v4.8h, v0.8h, v23.8h \n" // B + "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUV411ROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 
+#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +#ifdef HAS_ARGBTOUVROW_NEON +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBTOUVROW_NEON + +// TODO(fbarchard): Subsample match C code. +#ifdef HAS_ARGBTOUVJROW_NEON +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. 
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBTOUVJROW_NEON + +#ifdef HAS_BGRATOUVROW_NEON +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_bgra + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. + "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. + "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q1, q1, #1 \n" // 2x average + "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q3, q3, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q3, q2, q1) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_stride_bgra), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_BGRATOUVROW_NEON + +#ifdef HAS_ABGRTOUVROW_NEON +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_abgr + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. 
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ABGRTOUVROW_NEON + +#ifdef HAS_RGBATOUVROW_NEON +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_stride_rgba), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_RGBATOUVROW_NEON + +#ifdef HAS_RGB24TOUVROW_NEON +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + MEMACCESS(0) + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + MEMACCESS(1) + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. 
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_stride_rgb24), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_RGB24TOUVROW_NEON + +#ifdef HAS_RAWTOUVROW_NEON +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + MEMACCESS(0) + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + MEMACCESS(1) + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_stride_raw), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_RAWTOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_RGB565TOUVROW_NEON +void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. 
+ RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_RGB565TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB1555TOUVROW_NEON +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 
+ MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGB1555TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB4444TOUVROW_NEON +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGB4444TOUVROW_NEON + +#ifdef HAS_RGB565TOYROW_NEON +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} +#endif // HAS_RGB565TOYROW_NEON + +#ifdef HAS_ARGB1555TOYROW_NEON +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} +#endif // HAS_ARGB1555TOYROW_NEON + +#ifdef HAS_ARGB4444TOYROW_NEON +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} +#endif // HAS_ARGB4444TOYROW_NEON + +#ifdef HAS_BGRATOYROW_NEON +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_BGRATOYROW_NEON + +#ifdef HAS_ABGRTOYROW_NEON +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_ABGRTOYROW_NEON + +#ifdef HAS_RGBATOYROW_NEON +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_RGBATOYROW_NEON + +#ifdef HAS_RGB24TOYROW_NEON +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_RGB24TOYROW_NEON + +#ifdef HAS_RAWTOYROW_NEON +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_RAWTOYROW_NEON + +// Bilinear filter 16x2 -> 16x1 +#ifdef HAS_INTERPOLATEROW_NEON +void InterpolateRow_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %3, %3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %3, %3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %3, %3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v0.16b}, [%2], #16 \n" + "subs %3, %3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction), // %4 + "+r"(y0_fraction) // %5 + : + : "cc", "memory", "v0", "v1", "v3", "v4", "v5" + ); +} +#endif // HAS_INTERPOLATEROW_NEON + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +#ifdef HAS_ARGBBLENDROW_NEON +void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "subs %3, %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB0. + MEMACCESS(1) + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. 
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + MEMACCESS(2) + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 pixels of ARGB. + "bge 8b \n" + + "89: \n" + "adds %3, %3, #8-1 \n" + "blt 99f \n" + + // Blend 1 pixels. + "1: \n" + MEMACCESS(0) + "ld4 {v0.b-v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. + MEMACCESS(1) + "ld4 {v4.b-v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + MEMACCESS(2) + "st4 {v0.b-v3.b}[0], [%2], #4 \n" // store 1 pixel. + "bge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18" + ); +} +#endif // HAS_ARGBBLENDROW_NEON + +// Attenuate 8 pixels at a time. +#ifdef HAS_ARGBATTENUATEROW_NEON +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + MEMACCESS(1) + "st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" + ); +} +#endif // HAS_ARGBATTENUATEROW_NEON + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +#ifdef HAS_ARGBQUANTIZEROW_NEON +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 
255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + MEMACCESS(0) + "st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" + ); +} +#endif // HAS_ARGBQUANTIZEROW_NEON + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +#ifdef HAS_ARGBSHADEROW_NEON +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v4.8b-v7.8b}, [%0], #32 \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + MEMACCESS(1) + "st4 {v4.8b-v7.8b}, [%1], #32 \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "v0", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBSHADEROW_NEON + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. +// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; +#ifdef HAS_ARGBGRAYROW_NEON +void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "movi v24.8b, #15 \n" // B * 0.11400 coefficient + "movi v25.8b, #75 \n" // G * 0.58700 coefficient + "movi v26.8b, #38 \n" // R * 0.29900 coefficient + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlal v4.8h, v1.8b, v25.8b \n" // G + "umlal v4.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B + "mov v1.8b, v0.8b \n" // G + "mov v2.8b, v0.8b \n" // R + MEMACCESS(1) + "st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" + ); +} +#endif // HAS_ARGBGRAYROW_NEON + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
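The sepia weights appear in the per-channel formulas just below; a plain-C sketch of one pixel, with the saturation that uqshrn provides (illustrative only, not libyuv API):

static void SepiaPixel_sketch(unsigned char* b, unsigned char* g, unsigned char* r) {
  int sb = (*r * 35 + *g * 68 + *b * 17) >> 7;
  int sg = (*r * 45 + *g * 88 + *b * 22) >> 7;
  int sr = (*r * 50 + *g * 98 + *b * 24) >> 7;
  *b = (unsigned char)(sb > 255 ? 255 : sb);  // clamp to 8 bits
  *g = (unsigned char)(sg > 255 ? 255 : sg);
  *r = (unsigned char)(sr > 255 ? 255 : sr);
}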
+// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 + +#ifdef HAS_ARGBSEPIAROW_NEON +void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { + asm volatile ( + "movi v20.8b, #17 \n" // BB coefficient + "movi v21.8b, #68 \n" // BG coefficient + "movi v22.8b, #35 \n" // BR coefficient + "movi v24.8b, #22 \n" // GB coefficient + "movi v25.8b, #88 \n" // GG coefficient + "movi v26.8b, #45 \n" // GR coefficient + "movi v28.8b, #24 \n" // BB coefficient + "movi v29.8b, #98 \n" // BG coefficient + "movi v30.8b, #50 \n" // BR coefficient + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 ARGB pixels. + "subs %1, %1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R + MEMACCESS(0) + "st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" + ); +} +#endif // HAS_ARGBSEPIAROW_NEON + +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// TODO(fbarchard): Was same as Sepia except matrix is provided. This function +// needs to saturate. Consider doing a non-saturating version. +#ifdef HAS_ARGBCOLORMATRIXROW_NEON +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + asm volatile ( + MEMACCESS(3) + "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. + "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. + "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v16.8b-v19.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + MEMACCESS(1) + "st4 {v16.8b-v19.8b}, [%1], #32 \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBCOLORMATRIXROW_NEON + +// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +#ifdef HAS_ARGBMULTIPLYROW_NEON +void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + MEMACCESS(1) + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + MEMACCESS(2) + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBMULTIPLYROW_NEON + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. 
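For the multiply kernel above and the add/subtract kernels below, the per-channel arithmetic amounts to a rounding 8x8->8 multiply and saturating add/subtract; a minimal scalar sketch (illustrative helpers, not libyuv API):

static unsigned char MulChannel_sketch(unsigned char s, unsigned char d) {
  return (unsigned char)((s * d + 128) >> 8);      // umull + rshrn #8 (rounding)
}
static unsigned char AddChannel_sketch(unsigned char s, unsigned char d) {
  int v = s + d;
  return (unsigned char)(v > 255 ? 255 : v);       // uqadd (saturating)
}
static unsigned char SubChannel_sketch(unsigned char s, unsigned char d) {
  int v = s - d;
  return (unsigned char)(v < 0 ? 0 : v);           // uqsub (saturating)
}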
+#ifdef HAS_ARGBADDROW_NEON +void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + MEMACCESS(1) + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + MEMACCESS(2) + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBADDROW_NEON + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +#ifdef HAS_ARGBSUBTRACTROW_NEON +void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + MEMACCESS(1) + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + MEMACCESS(2) + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBSUBTRACTROW_NEON + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +#ifdef HAS_SOBELROW_NEON +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + MEMACCESS(1) + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "mov v1.8b, v0.8b \n" + "mov v2.8b, v0.8b \n" + MEMACCESS(2) + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_SOBELROW_NEON + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +#ifdef HAS_SOBELTOPLANEROW_NEON +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + // 16 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "uqadd v0.16b, v0.16b, v1.16b \n" // add + MEMACCESS(2) + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1" + ); +} +#endif // HAS_SOBELTOPLANEROW_NEON + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. 
+// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +#ifdef HAS_SOBELXYROW_NEON +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + MEMACCESS(1) + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "uqadd v1.8b, v0.8b, v2.8b \n" // add + MEMACCESS(2) + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_SOBELXYROW_NEON + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +#ifdef HAS_SOBELXROW_NEON +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0],%5 \n" // top + MEMACCESS(0) + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + MEMACCESS(1) + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + MEMACCESS(1) + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + MEMACCESS(2) + "ld1 {v2.8b}, [%2],%5 \n" // bottom + MEMACCESS(2) + "ld1 {v3.8b}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + MEMACCESS(3) + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_SOBELXROW_NEON + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +#ifdef HAS_SOBELYROW_NEON +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0],%4 \n" // left + MEMACCESS(1) + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + MEMACCESS(1) + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0],%5 \n" // right + MEMACCESS(1) + "ld1 {v3.8b}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_SOBELYROW_NEON +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/row_posix.cc b/TMessagesProj/jni/libyuv/source/row_posix.cc new file mode 100644 index 000000000..106fda568 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/row_posix.cc @@ -0,0 +1,6443 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +// Constants for ARGB +static vec8 kARGBToY = { + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 +}; + +// JPeg full range. +static vec8 kARGBToYJ = { + 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 +}; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +static vec8 kARGBToU = { + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 +}; + +static vec8 kARGBToUJ = { + 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 +}; + +static vec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +static vec8 kARGBToVJ = { + -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 +}; + +// Constants for BGRA +static vec8 kBGRAToY = { + 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 +}; + +static vec8 kBGRAToU = { + 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 +}; + +static vec8 kBGRAToV = { + 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 +}; + +// Constants for ABGR +static vec8 kABGRToY = { + 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 +}; + +static vec8 kABGRToU = { + -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 +}; + +static vec8 kABGRToV = { + 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 +}; + +// Constants for RGBA. +static vec8 kRGBAToY = { + 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 +}; + +static vec8 kRGBAToU = { + 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 +}; + +static vec8 kRGBAToV = { + 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 +}; + +static uvec8 kAddY16 = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u +}; + +static vec16 kAddYJ64 = { + 64, 64, 64, 64, 64, 64, 64, 64 +}; + +static uvec8 kAddUV128 = { + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + +static uvec16 kAddUVJ128 = { + 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u +}; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +#ifdef HAS_RGB24TOARGBROW_SSSE3 + +// Shuffle table for converting RGB24 to ARGB. +static uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u +}; + +// Shuffle table for converting RAW to ARGB. +static uvec8 kShuffleMaskRAWToARGB = { + 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u +}; + +// Shuffle table for converting ARGB to RGB24. +static uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGB to RAW. 
+static uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u +}; + +// Shuffle table for converting ARGB to RAW. +static uvec8 kShuffleMaskARGBToRAW_0 = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u +}; +#endif // HAS_RGB24TOARGBROW_SSSE3 + +#if defined(TESTING) && defined(__x86_64__) +void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + asm volatile ( + ".p2align 5 \n" + "mov %%eax,%%eax \n" + "mov %%ebx,%%ebx \n" + "mov %%ecx,%%ecx \n" + "mov %%edx,%%edx \n" + "mov %%esi,%%esi \n" + "mov %%edi,%%edi \n" + "mov %%ebp,%%ebp \n" + "mov %%esp,%%esp \n" + ".p2align 5 \n" + "mov %%r8d,%%r8d \n" + "mov %%r9d,%%r9d \n" + "mov %%r10d,%%r10d \n" + "mov %%r11d,%%r11d \n" + "mov %%r12d,%%r12d \n" + "mov %%r13d,%%r13d \n" + "mov %%r14d,%%r14d \n" + "mov %%r15d,%%r15d \n" + ".p2align 5 \n" + "lea (%%rax),%%eax \n" + "lea (%%rbx),%%ebx \n" + "lea (%%rcx),%%ecx \n" + "lea (%%rdx),%%edx \n" + "lea (%%rsi),%%esi \n" + "lea (%%rdi),%%edi \n" + "lea (%%rbp),%%ebp \n" + "lea (%%rsp),%%esp \n" + ".p2align 5 \n" + "lea (%%r8),%%r8d \n" + "lea (%%r9),%%r9d \n" + "lea (%%r10),%%r10d \n" + "lea (%%r11),%%r11d \n" + "lea (%%r12),%%r12d \n" + "lea (%%r13),%%r13d \n" + "lea (%%r14),%%r14d \n" + "lea (%%r15),%%r15d \n" + + ".p2align 5 \n" + "lea 0x10(%%rax),%%eax \n" + "lea 0x10(%%rbx),%%ebx \n" + "lea 0x10(%%rcx),%%ecx \n" + "lea 0x10(%%rdx),%%edx \n" + "lea 0x10(%%rsi),%%esi \n" + "lea 0x10(%%rdi),%%edi \n" + "lea 0x10(%%rbp),%%ebp \n" + "lea 0x10(%%rsp),%%esp \n" + ".p2align 5 \n" + "lea 0x10(%%r8),%%r8d \n" + "lea 0x10(%%r9),%%r9d \n" + "lea 0x10(%%r10),%%r10d \n" + "lea 0x10(%%r11),%%r11d \n" + "lea 0x10(%%r12),%%r12d \n" + "lea 0x10(%%r13),%%r13d \n" + "lea 0x10(%%r14),%%r14d \n" + "lea 0x10(%%r15),%%r15d \n" + + ".p2align 5 \n" + "add 0x10,%%eax \n" + "add 0x10,%%ebx \n" + "add 0x10,%%ecx \n" + "add 0x10,%%edx \n" + "add 0x10,%%esi \n" + "add 0x10,%%edi \n" + "add 0x10,%%ebp \n" + "add 0x10,%%esp \n" + ".p2align 5 \n" + "add 0x10,%%r8d \n" + "add 0x10,%%r9d \n" + "add 0x10,%%r10d \n" + "add 0x10,%%r11d \n" + "add 0x10,%%r12d \n" + "add 0x10,%%r13d \n" + "add 0x10,%%r14d \n" + "add 0x10,%%r15d \n" + + ".p2align 2 \n" + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // TESTING + +#ifdef HAS_I400TOARGBROW_SSE2 +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void 
I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, + int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_I400TOARGBROW_SSE2 + +#ifdef HAS_RGB24TOARGBROW_SSSE3 +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" + "lea " MEMLEA(0x30,0) ",%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "por %%xmm5,%%xmm3 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskRGB24ToARGB) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" + "lea " MEMLEA(0x30,0) ",%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "por %%xmm5,%%xmm3 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 
\n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) + MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw $0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) + MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2) + MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 
+#endif + ); +} + +void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "movdqa %3,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x30,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "movdqa %3,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x30,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) 
",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMACCESS2(0x8,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} +#endif // HAS_RGB24TOARGBROW_SSSE3 + +#ifdef HAS_ARGBTOYROW_SSSE3 +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBTOYROW_SSSE3 + +#ifdef 
HAS_ARGBTOYJROW_SSSE3 +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBTOYJROW_SSSE3 + +#ifdef HAS_ARGBTOUVROW_SSSE3 +// TODO(fbarchard): pass xmm constants to single block of assembly. +// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes +// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers, +// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around +// and considered unsafe. 
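// For reference only: a rough scalar equivalent of what the SSSE3 UV rows
// below compute. It box-averages each 2x2 block of ARGB pixels (the SIMD code
// uses rounding pavgb, so results may differ by +/-1), then applies the signed
// kARGBToU / kARGBToV coefficients defined above with an arithmetic >>8 and a
// +128 bias, matching pmaddubsw/phaddw/psraw/paddb. Illustrative sketch only,
// assuming an even width; the function name is not part of the libyuv API.
static void ARGBToUVRow_ScalarSketch(const uint8* src_argb,
                                     int src_stride_argb,
                                     uint8* dst_u, uint8* dst_v, int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    const uint8* p = src_argb + x * 4;     // two ARGB pixels on this row
    const uint8* q = p + src_stride_argb;  // the two pixels directly below
    int b = (p[0] + p[4] + q[0] + q[4]) >> 2;  // ARGB is stored B,G,R,A
    int g = (p[1] + p[5] + q[1] + q[5]) >> 2;
    int r = (p[2] + p[6] + q[2] + q[6]) >> 2;
    dst_u[x >> 1] = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    dst_v[x >> 1] = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
  }
}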
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. 
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToUJ), // %0 + "m"(kARGBToVJ), // %1 + "m"(kAddUVJ128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps 
%%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToUJ), // %0 + "m"(kARGBToVJ), // %1 + "m"(kAddUVJ128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)) + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw 
%%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6" +#endif + ); +} + +void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, + uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6" +#endif + ); +} + +void ARGBToUV422Row_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) 
",%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kARGBToU), // %0 + "m"(kARGBToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" 
+ "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kBGRAToU), // %0 + "m"(kBGRAToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kBGRAToU), // %0 + "m"(kBGRAToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw 
%%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kRGBAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void 
RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kRGBAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kABGRToU), // %0 + "m"(kABGRToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kABGRToU), // %0 + "m"(kABGRToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + 
"pavgb %%xmm7,%%xmm1 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kRGBAToU), // %0 + "m"(kRGBAToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_rgba0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_rgba)) + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kRGBAToU), // %0 + "m"(kRGBAToV), // %1 + "m"(kAddUV128) // %2 + ); + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + 
"movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_rgba0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_rgba)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBTOUVROW_SSSE3 + +#ifdef HAS_I422TOARGBROW_SSSE3 +#define UB 127 /* min(63,(int8)(2.018 * 64)) */ +#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ +#define UR 0 + +#define VB 0 +#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ +#define VR 102 /* (int8)(1.596 * 64 + 0.5) */ + +// Bias +#define BB UB * 128 + VB * 128 +#define BG UG * 128 + VG * 128 +#define BR UR * 128 + VR * 128 + +#define YG 74 /* (int8)(1.164 * 64 + 0.5) */ + +struct { + vec8 kUVToB; // 0 + vec8 kUVToG; // 16 + vec8 kUVToR; // 32 + vec16 kUVBiasB; // 48 + vec16 kUVBiasG; // 64 + vec16 kUVBiasR; // 80 + vec16 kYSub16; // 96 + vec16 kYToRgb; // 112 + vec8 kVUToB; // 128 + vec8 kVUToG; // 144 + vec8 kVUToR; // 160 +} static SIMD_ALIGNED(kYuvConstants) = { + { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB }, + { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, + { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR }, + { BB, BB, BB, BB, BB, BB, BB, BB }, + { BG, BG, BG, BG, BG, BG, BG, BG }, + { BR, BR, BR, BR, BR, BR, BR, BR }, + { 16, 16, 16, 16, 16, 16, 16, 16 }, + { YG, YG, YG, YG, YG, YG, YG, YG }, + { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB }, + { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, + { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR } +}; + + +// Read 8 UV from 411 +#define READYUV444 \ + "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + BUNDLEALIGN \ + MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" + +// Read 4 UV from 422, upsample to 8 UV +#define READYUV422 \ + "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + BUNDLEALIGN \ + MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" + +// Read 
2 UV from 411, upsample to 8 UV +#define READYUV411 \ + "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + BUNDLEALIGN \ + MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpckldq %%xmm0,%%xmm0 \n" + +// Read 4 UV from NV12, upsample to 8 UV +#define READNV12 \ + "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ + "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" + +// Convert 8 pixels: 8 UV and 8 Y +#define YUVTORGB \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \ + "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \ + "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \ + "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ + "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ + "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ + "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + +// Convert 8 pixels: 8 VU and 8 Y +#define YVUTORGB \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \ + "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \ + "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \ + "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ + "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ + "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ + "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + +void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV444 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP 
I422ToRGB24Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgb24, + int width) { +// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs. +#if defined(__i386__) + asm volatile ( + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" + :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)); +#endif + + asm volatile ( +#if !defined(__i386__) + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" +#endif + "sub %[u_buf],%[v_buf] \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" + "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) +#if !defined(__i386__) + , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) +#endif + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_raw, + int width) { +// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs. 
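+// With -fPIC on 32-bit x86, %ebx is reserved for the GOT pointer, which is why
+// the shuffle masks are preloaded in a separate asm block below.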
+#if defined(__i386__) + asm volatile ( + "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" + :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), + [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)); +#endif + + asm volatile ( +#if !defined(__i386__) + "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" +#endif + "sub %[u_buf],%[v_buf] \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0," MEMACCESS([dst_raw]) " \n" + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" + "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_raw]"+r"(dst_raw), // %[dst_raw] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) +#if !defined(__i386__) + , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), + [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) +#endif + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV411 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", 
"xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READNV12 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" + // Does not use r14. +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READNV12 + YVUTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" + // Does not use r14. +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV444 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + 
: [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV411 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READNV12 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" + // Does not use r14. +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READNV12 + YVUTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" + // Does not use r14. 
+#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "pcmpeqb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm5 \n" + "movdqa %%xmm5,%%xmm0 \n" + "punpcklwd %%xmm1,%%xmm5 \n" + "punpckhwd %%xmm1,%%xmm0 \n" + "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n" + "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" + "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" + "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgba, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "pcmpeqb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "movdqa %%xmm5,%%xmm0 \n" + "punpcklwd %%xmm1,%%xmm5 \n" + "punpckhwd %%xmm1,%%xmm0 \n" + "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n" + "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" + "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + 
"1: \n" + READYUV422 + YUVTORGB + "pcmpeqb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm5 \n" + "movdqa %%xmm5,%%xmm0 \n" + "punpcklwd %%xmm1,%%xmm5 \n" + "punpckhwd %%xmm1,%%xmm0 \n" + "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n" + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" + "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" + "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgba, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB + "pcmpeqb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "movdqa %%xmm5,%%xmm0 \n" + "punpcklwd %%xmm1,%%xmm5 \n" + "punpckhwd %%xmm1,%%xmm0 \n" + "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n" + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" + "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +#endif // HAS_I422TOARGBROW_SSSE3 + +#ifdef HAS_YTOARGBROW_SSE2 +void YToARGBRow_SSE2(const uint8* y_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "mov $0x00100010,%%eax \n" + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "mov $0x004a004a,%%eax \n" + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "psubusw %%xmm3,%%xmm0 \n" + "pmullw %%xmm2,%%xmm0 \n" + "psrlw $6, %%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + + // Step 2: Weave into ARGB + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "por %%xmm4,%%xmm1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} +#endif // HAS_YTOARGBROW_SSE2 + +#ifdef HAS_MIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +static uvec8 kShuffleMirror = { + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "movdqa %3,%%xmm5 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + LABELALIGN + "1: \n" + MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0 + "pshufb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif + ); +} +#endif // HAS_MIRRORROW_SSSE3 + +#ifdef HAS_MIRRORROW_SSE2 +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "lea " MEMLEA(-0x10,0) ",%0 \n" + LABELALIGN + "1: \n" + MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0 + "movdqa %%xmm0,%%xmm1 \n" + "psllw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "pshuflw $0x1b,%%xmm0,%%xmm0 \n" + "pshufhw $0x1b,%%xmm0,%%xmm0 \n" + "pshufd $0x4e,%%xmm0,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1)",%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_MIRRORROW_SSE2 + +#ifdef HAS_MIRRORROW_UV_SSSE3 +// Shuffle table for reversing the bytes of UV channels. 
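+// One pshufb with this mask packs the mirrored U bytes into the low 8 bytes and
+// the mirrored V bytes into the high 8 bytes, so movlpd/movhpd can store the two
+// planes directly.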
+static uvec8 kShuffleMirrorUV = { + 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u +}; +void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, + int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "movdqa %4,%%xmm1 \n" + "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "sub $8,%3 \n" + "movlpd %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_MIRRORROW_UV_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +static uvec8 kARGBShuffleMirror = { + 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u +}; + +void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" + "movdqa %3,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif + ); +} +#endif // HAS_ARGBMIRRORROW_SSSE3 + +#ifdef HAS_SPLITUVROW_SSE2 +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if 
defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_MERGEUVROW_SSE2 +void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} + +void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, + uint8* dst_uv, int width) { + asm volatile ( + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_MERGEUVROW_SSE2 + +#ifdef HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_X86 +void CopyRow_X86(const uint8* src, uint8* dst, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "shr $0x2,%2 \n" + "rep movsl " MEMMOVESTRING(0,1) " \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc" + ); +} +#endif // HAS_COPYROW_X86 + +#ifdef HAS_COPYROW_ERMS +// Unaligned Multiple of 1. 
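+// A bare 'rep movsb' copies any byte count with no alignment requirement and is
+// fast on CPUs that report the ERMS (Enhanced REP MOVSB/STOSB) feature.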
+void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "rep movsb " MEMMOVESTRING(0,1) " \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc" + ); +} +#endif // HAS_COPYROW_ERMS + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa " MEMACCESS(1) ",%%xmm4 \n" + "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqa %%xmm2," MEMACCESS(1) " \n" + "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1," MEMACCESS(1) " \n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqa " MEMACCESS(1) ",%%xmm4 \n" + "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqa %%xmm2," MEMACCESS(1) " \n" + "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + LABELALIGN + "1: \n" + "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" + "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 
\n" + "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1," MEMACCESS(1) " \n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +void SetRow_X86(uint8* dst, uint32 v32, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "shr $0x2,%1 \n" + "rep stosl " MEMSTORESTRING(eax,0) " \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); +} + +void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + for (int y = 0; y < height; ++y) { + size_t width_tmp = (size_t)(width); + uint32* d = (uint32*)(dst); + asm volatile ( + "rep stosl " MEMSTORESTRING(eax,0) " \n" + : "+D"(d), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); + dst += dst_stride; + } +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_SSE2 +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 
\n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 
\n" + "sub $0x10,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand 
%%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSE2 +// Blend 8 pixels at a time. +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x1,%3 \n" + "je 91f \n" + "jl 99f \n" + + // 1 pixel loop until destination pointer is aligned. + "10: \n" + "test $0xf,%2 \n" + "je 19f \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x1,%3 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "jge 10b \n" + + "19: \n" + "add $1-4,%3 \n" + "jl 49f \n" + + // 4 pixel loop. 
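+  // Per channel: dst = min(255, src0 + ((src1 * (256 - a0)) >> 8)), with the
+  // destination alpha forced to 255. src0 is expected to be premultiplied
+  // (see ARGBAttenuateRow below).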
+ LABELALIGN + "41: \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jge 41b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. + "91: \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x1,%3 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBBLENDROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. +static uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; + +// Blend 8 pixels at a time +// Shuffle table for reversing the bytes. + +// Same as SSE2, but replaces +// psrlw xmm3, 8 // alpha +// pshufhw xmm3, xmm3,0F5h // 8 alpha words +// pshuflw xmm3, xmm3,0F5h +// with.. +// pshufb xmm3, kShuffleAlpha // alpha + +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x1,%3 \n" + "je 91f \n" + "jl 99f \n" + + // 1 pixel loop until destination pointer is aligned. + "10: \n" + "test $0xf,%2 \n" + "je 19f \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x1,%3 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "jge 10b \n" + + "19: \n" + "add $1-4,%3 \n" + "jl 49f \n" + "test $0xf,%0 \n" + "jne 41f \n" + "test $0xf,%1 \n" + "jne 41f \n" + + // 4 pixel loop. 
+ LABELALIGN + "40: \n" + "movdqa " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqa " MEMACCESS(1) ",%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqa " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jge 40b \n" + "jmp 49f \n" + + // 4 pixel unaligned loop. + LABELALIGN + "41: \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jge 41b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. + "91: \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x1,%3 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_SSE2 +// Attenuate 4 pixels at a time. +// aligned to 16 bytes +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x8,%%xmm5 \n" + + // 4 pixel loop. 
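+  // Each byte is widened to c * 257 (punpcklbw with itself) and multiplied by
+  // the replicated alpha word with pmulhuw, approximating c * a / 255; the
+  // original alpha byte is restored through the 0xFF000000 mask.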
+ LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pshufhw $0xff,%%xmm0,%%xmm2 \n" + "pshuflw $0xff,%%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pshufhw $0xff,%%xmm1,%%xmm2 \n" + "pshuflw $0xff,%%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm4,%%xmm2 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBATTENUATEROW_SSE2 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha +static uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, +}; +static uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, +}; +// Attenuate 4 pixels at a time. +// aligned to 16 bytes +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +// aligned to 16 bytes +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, + int width) { + uintptr_t alpha = 0; + asm volatile ( + // 4 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movzb " MEMACCESS2(0x03,0) ",%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x07,0) ",%3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "movzb " MEMACCESS2(0x0b,0) ",%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x0f,0) ",%3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "+r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBUNATTENUATEROW_SSE2 + +#ifdef HAS_ARGBGRAYROW_SSSE3 +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBGRAYROW_SSSE3 + +#ifdef HAS_ARGBSEPIAROW_SSSE3 +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +// Constant for ARGB color to sepia tone +static vec8 kARGBToSepiaB = { + 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 +}; + +static vec8 kARGBToSepiaG = { + 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 +}; + +static vec8 kARGBToSepiaR = { + 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 +}; + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { + asm volatile ( + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + + // 8 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm5 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm5 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "sub $0x8,%1 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "m"(kARGBToSepiaB), // %2 + "m"(kARGBToSepiaG), // %3 + "m"(kARGBToSepiaR) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} +#endif // HAS_ARGBSEPIAROW_SSSE3 + +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// Same as Sepia except matrix is provided. +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + asm volatile ( + "movdqu " MEMACCESS(3) ",%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). 
+// aligned to 16 bytes +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "sub $0x4,%1 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. +// Aligned to 16 bytes. +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_ARGBSHADEROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. +void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + +#ifdef HAS_ARGBADDROW_SSE2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. 
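For reference while reading these two-source arithmetic rows, the packing-heavy SSE2 code reduces to simple per-byte math. A minimal scalar sketch of the saturating add that the ARGBAddRow_SSE2 routine below vectorizes with paddusb (illustrative only, not part of the patch; the function name is made up, uint8 comes from libyuv's basic_types.h):

void ARGBAddRow_C_sketch(const uint8* src_argb0, const uint8* src_argb1,
                         uint8* dst_argb, int width) {
  // Each ARGB pixel is 4 bytes; add channel-wise and clamp at 255,
  // which is what paddusb does 16 bytes at a time.
  for (int i = 0; i < width * 4; ++i) {
    int sum = src_argb0[i] + src_argb1[i];
    dst_argb[i] = (uint8)(sum > 255 ? 255 : sum);
  }
}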
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_ARGBADDROW_SSE2 + +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +// Subtract 2 rows of ARGB pixels, 4 pixels at a time. +void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "psubusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_ARGBSUBTRACTROW_SSE2 + +#ifdef HAS_SOBELXROW_SSE2 +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + asm volatile ( + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + BUNDLEALIGN + MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 + MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 + MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "sub $0x8,%4 \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) + "lea " MEMLEA(0x8,0) ",%0 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_SOBELXROW_SSE2 + +#ifdef HAS_SOBELYROW_SSE2 +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + asm volatile ( + "sub %0,%1 \n" + "sub %0,%2 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 8 pixel loop. 
+ LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + BUNDLEALIGN + "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" + MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + BUNDLEALIGN + "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" + MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "sub $0x8,%3 \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) + "lea " MEMLEA(0x8,0) ",%0 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_SOBELYROW_SSE2 + +#ifdef HAS_SOBELROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm2 \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm1 \n" + "punpckhwd %%xmm2,%%xmm2 \n" + "por %%xmm5,%%xmm1 \n" + "por %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklwd %%xmm0,%%xmm3 \n" + "punpckhwd %%xmm0,%%xmm0 \n" + "por %%xmm5,%%xmm3 \n" + "por %%xmm5,%%xmm0 \n" + "sub $0x10,%3 \n" + "movdqa %%xmm1," MEMACCESS(2) " \n" + "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" + "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n" + "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n" + "lea " MEMLEA(0x40,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_SOBELROW_SSE2 + +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x10,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + +#ifdef HAS_SOBELXYROW_SSE2 +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "paddusb %%xmm1,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "punpckhbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm2,%%xmm4 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm4,%%xmm6 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "movdqa %%xmm1,%%xmm7 \n" + "punpcklwd %%xmm0,%%xmm7 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "sub $0x10,%3 \n" + "movdqa %%xmm6," MEMACCESS(2) " \n" + "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n" + "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n" + "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n" + "lea " MEMLEA(0x40,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_SOBELXYROW_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the left of the value, inclusive of the value. 
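A plain-C sketch of the running sums the SSE2 loop below accumulates may help follow the register juggling (illustrative only, not part of the patch; it assumes cumsum and previous_cumsum hold four int32 channels per pixel, which is how the vector code treats them):

void ComputeCumulativeSumRow_C_sketch(const uint8* row, int32* cumsum,
                                      const int32* previous_cumsum,
                                      int width) {
  int32 sum[4] = {0, 0, 0, 0};  // running per-channel sum across this row
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      // Sum above-and-to-the-left = this row's running sum plus the
      // cumulative sum of the same column from the row above.
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}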
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) { + asm volatile ( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" + + // 4 pixel loop \n" + LABELALIGN + "40: \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqa " MEMACCESS(2) ",%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n" + "lea " MEMLEA(0x40,2) ",%2 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqa %%xmm2," MEMACCESS(1) " \n" + "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n" + "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop \n" + LABELALIGN + "10: \n" + "movd " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu " MEMACCESS(2) ",%%xmm2 \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + + "19: \n" + : "+r"(row), // %0 + "+r"(cumsum), // %1 + "+r"(previous_cumsum), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, + int count) { + asm volatile ( + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" + + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" + + // 4 pixel small loop \n" + LABELALIGN + "4: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 + MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 + MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 + "lea " MEMLEA(0x40,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" + "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" + "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 + MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 + MEMOPREG(paddd,0x30,1,4,4,xmm3) // 
paddd 0x30(%1,%4,4),%%xmm3 + "lea " MEMLEA(0x40,1) ",%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" + + // 4 pixel loop \n" + LABELALIGN + "40: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 + MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 + MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 + "lea " MEMLEA(0x40,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" + "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" + "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 + MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 + MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 + "lea " MEMLEA(0x40,1) ",%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop \n" + LABELALIGN + "10: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + "lea " MEMLEA(0x10,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + BUNDLEALIGN + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + "lea " MEMLEA(0x10,1) ",%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. 
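The fixed-point address math in the routine below is easier to read against a scalar sketch of the same sampling (illustrative only, not part of the patch; it assumes src_dudv holds the starting u,v in its first two floats and the per-pixel step du,dv in the next two, which is how the SSE2 code consumes it):

void ARGBAffineRow_C_sketch(const uint8* src_argb, int src_argb_stride,
                            uint8* dst_argb, const float* src_dudv,
                            int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  for (int i = 0; i < width; ++i) {
    // Truncate u,v toward zero (cvttps2dq) and fetch one ARGB pixel.
    const uint8* src = src_argb + (int)v * src_argb_stride + (int)u * 4;
    dst_argb[i * 4 + 0] = src[0];
    dst_argb[i * 4 + 1] = src[1];
    dst_argb[i * 4 + 2] = src[2];
    dst_argb[i * 4 + 3] = src[3];
    u += src_dudv[2];  // du
    v += src_dudv[3];  // dv
  }
}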
+LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* src_dudv, int width) { + intptr_t src_argb_stride_temp = src_argb_stride; + intptr_t temp = 0; + asm volatile ( + "movq " MEMACCESS(3) ",%%xmm2 \n" + "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd %1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" + + // 4 pixel loop \n" + LABELALIGN + "40: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 + MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1," MEMACCESS(2) " \n" + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 + MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + "sub $0x4,%4 \n" + "movq %%xmm0," MEMACCESS2(0x08,2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%4 \n" + "jl 19f \n" + + // 1 pixel loop \n" + LABELALIGN + "10: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%k1 \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 + "sub $0x1,%4 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x04,2) ",%2 \n" + "jge 10b \n" + "19: \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_stride_temp), // %1 + "+r"(dst_argb), // %2 + "+r"(src_dudv), // %3 + "+rm"(width), // %4 + "+r"(temp) // %5 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_SSSE3 +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm2) + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 25 / 75. 
+ LABELALIGN + "25: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "movdqa " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm0) + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "sub $0x10,%2 \n" + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm5" +#endif + ); +} +#endif // HAS_INTERPOLATEROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_SSE2 +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2 + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "psubw %%xmm0,%%xmm2 \n" + "psubw %%xmm1,%%xmm3 \n" + "paddw %%xmm2,%%xmm2 \n" + "paddw %%xmm3,%%xmm3 \n" + "pmulhw %%xmm5,%%xmm2 \n" + "pmulhw %%xmm5,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 25 / 75. + LABELALIGN + "25: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. 
+ LABELALIGN + "75: \n" + "movdqa " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0 + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "sub $0x10,%2 \n" + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_INTERPOLATEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_SSSE3 +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm2) + "movdqu %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 25 / 75. + LABELALIGN + "25: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm0) + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ LABELALIGN + "100: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + "sub $0x10,%2 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm5" +#endif + ); +} +#endif // HAS_INTERPOLATEROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_SSE2 +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "psubw %%xmm0,%%xmm2 \n" + "psubw %%xmm1,%%xmm3 \n" + "paddw %%xmm2,%%xmm2 \n" + "paddw %%xmm3,%%xmm3 \n" + "pmulhw %%xmm5,%%xmm2 \n" + "pmulhw %%xmm5,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 25 / 75. + LABELALIGN + "25: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ LABELALIGN + "100: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + "sub $0x10,%2 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_INTERPOLATEROW_SSE2 + +#ifdef HAS_HALFROW_SSE2 +void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + asm volatile ( + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0 + "sub $0x10,%2 \n" + MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1) + "lea " MEMLEA(0x10,0) ",%0 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(pix) // %2 + : "r"((intptr_t)(src_uv_stride)) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0" +#endif + ); +} +#endif // HAS_HALFROW_SSE2 + +#ifdef HAS_ARGBTOBAYERROW_SSSE3 +void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile ( + // NaCL caveat - assumes movd is from GPR + "movd %3,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "sub $0x8,%2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : "g"(selector) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBTOBAYERROW_SSSE3 + +#ifdef HAS_ARGBTOBAYERGGROW_SSE2 +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrld $0x8,%%xmm0 \n" + "psrld $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x8,%2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBTOBAYERGGROW_SSE2 + +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
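The shuffler argument here is a pshufb-style byte index table; for the channel swaps named above it repeats one 4-entry pattern per pixel, so a scalar sketch looks like the following (illustrative only, not part of the patch; it assumes the first four mask entries index within a single pixel, which holds for those swap masks):

void ARGBShuffleRow_C_sketch(const uint8* src_argb, uint8* dst_argb,
                             const uint8* shuffler, int pix) {
  for (int i = 0; i < pix; ++i) {
    // Reorder the 4 bytes of each pixel according to the mask entries.
    for (int j = 0; j < 4; ++j) {
      dst_argb[i * 4 + j] = src_argb[i * 4 + shuffler[j]];
    }
  }
}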
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "movdqa " MEMACCESS(3) ",%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "movdqa " MEMACCESS(3) ",%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBSHUFFLEROW_SSSE3 + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "sub $0x10,%2 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + uintptr_t pixel_temp = 0u; + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" + "mov " MEMACCESS(4) ",%k2 \n" + "cmp $0x3000102,%k2 \n" + "je 3012f \n" + "cmp $0x10203,%k2 \n" + "je 123f \n" + "cmp $0x30201,%k2 \n" + "je 321f \n" + "cmp $0x2010003,%k2 \n" + "je 2103f \n" + + LABELALIGN + "1: \n" + "movzb " MEMACCESS(4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS(1) " \n" + "movzb " MEMACCESS2(0x1,4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x1,1) " \n" + BUNDLEALIGN + "movzb " MEMACCESS2(0x2,4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x2,1) " \n" + "movzb " MEMACCESS2(0x3,4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x3,1) " \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "sub $0x1,%3 \n" + "jg 1b \n" + "jmp 99f \n" + + LABELALIGN + "123: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x1b,%%xmm0,%%xmm0 \n" + "pshuflw $0x1b,%%xmm0,%%xmm0 \n" + "pshufhw $0x1b,%%xmm1,%%xmm1 \n" + "pshuflw $0x1b,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 123b \n" + "jmp 99f \n" + + LABELALIGN + "321: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x39,%%xmm0,%%xmm0 \n" + "pshuflw $0x39,%%xmm0,%%xmm0 \n" + "pshufhw $0x39,%%xmm1,%%xmm1 \n" + "pshuflw $0x39,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 321b \n" + "jmp 99f \n" + + LABELALIGN + "2103: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x93,%%xmm0,%%xmm0 \n" + "pshuflw $0x93,%%xmm0,%%xmm0 \n" + "pshufhw $0x93,%%xmm1,%%xmm1 \n" + "pshuflw $0x93,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 2103b \n" + "jmp 99f \n" + + LABELALIGN + "3012: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0xc6,%%xmm0,%%xmm0 \n" + "pshuflw $0xc6,%%xmm0,%%xmm0 \n" + "pshufhw $0xc6,%%xmm1,%%xmm1 \n" + "pshuflw $0xc6,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 3012b \n" + + "99: \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+d"(pixel_temp), // %2 + "+r"(pix) // %3 + : "r"(shuffler) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBSHUFFLEROW_SSE2 + +#ifdef HAS_I422TOYUY2ROW_SSE2 +void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(1) ",%%xmm2 \n" + MEMOPREG(movq,0x00,1,2,1,xmm3) // 
movq (%1,%2,1),%%xmm3 + "lea " MEMLEA(0x8,1) ",%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(3) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n" + "lea " MEMLEA(0x20,3) ",%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} +#endif // HAS_I422TOYUY2ROW_SSE2 + +#ifdef HAS_I422TOUYVYROW_SSE2 +void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(1) ",%%xmm2 \n" + MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 + "lea " MEMLEA(0x8,1) ",%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1," MEMACCESS(3) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n" + "lea " MEMLEA(0x20,3) ",%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} +#endif // HAS_I422TOUYVYROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + asm volatile ( + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. 
+ LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n" + "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n" + "addps " MEMACCESS(3) ",%%xmm0 \n" + "addps " MEMACCESS(3) ",%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n" + "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n" + "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n" + "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "sub $0x2,%2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + asm volatile ( + "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" + "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" + "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" + "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" + + // 2 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels + "lea " MEMLEA(0x8,0) ",%0 \n" + "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats + "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X + "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X + "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X + "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X + "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X + "vcvttps2dq %%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" + "sub $0x2,%2 \n" + "vmovq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc" +#if defined(__SSE2__) +// TODO(fbarchard): declare ymm usage when applicable. + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, + int width) { + uintptr_t pixel_temp = 0u; + asm volatile ( + // 1 pixel loop. 
+ LABELALIGN + "1: \n" + "movzb " MEMACCESS(0) ",%1 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x4,0) " \n" + "movzb " MEMACCESS2(-0x3,0) ",%1 \n" + MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x3,0) " \n" + "movzb " MEMACCESS2(-0x2,0) ",%1 \n" + MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x2,0) " \n" + "movzb " MEMACCESS2(-0x1,0) ",%1 \n" + MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x1,0) " \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Tranform RGB pixels with color table. +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { + uintptr_t pixel_temp = 0u; + asm volatile ( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb " MEMACCESS(0) ",%1 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x4,0) " \n" + "movzb " MEMACCESS2(-0x3,0) ",%1 \n" + MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x3,0) " \n" + "movzb " MEMACCESS2(-0x2,0) ",%1 \n" + MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x2,0) " \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Tranform RGB pixels with luma table. +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff) { + uintptr_t pixel_temp = 0u; + uintptr_t table_temp = 0u; + asm volatile ( + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqu " MEMACCESS(2) ",%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS(2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS(3) " \n" + "movzb " MEMACCESS2(0x1,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x1,3) " \n" + "movzb " MEMACCESS2(0x2,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x2,3) " \n" + "movzb " MEMACCESS2(0x3,2) ",%0 \n" + "mov %b0," MEMACCESS2(0x3,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS2(0x4,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x4,3) " \n" + BUNDLEALIGN + "movzb " MEMACCESS2(0x5,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x5,3) " \n" + "movzb " MEMACCESS2(0x6,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x6,3) " \n" + "movzb " MEMACCESS2(0x7,2) ",%0 \n" + "mov %b0," MEMACCESS2(0x7,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS2(0x8,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x8,3) " \n" + "movzb " MEMACCESS2(0x9,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x9,3) " \n" + "movzb " MEMACCESS2(0xa,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xa,3) " \n" + "movzb " MEMACCESS2(0xb,2) ",%0 \n" + "mov %b0," MEMACCESS2(0xb,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb " MEMACCESS2(0xc,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xc,3) " \n" + "movzb " MEMACCESS2(0xd,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xd,3) " \n" + "movzb " MEMACCESS2(0xe,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xe,3) " \n" + "movzb " MEMACCESS2(0xf,2) ",%0 \n" + "mov %b0," MEMACCESS2(0xf,3) " \n" + "sub $0x4,%4 \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "lea " MEMLEA(0x10,3) ",%3 \n" + "jg 1b \n" + : "+d"(pixel_temp), // %0 + "+a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/row_win.cc b/TMessagesProj/jni/libyuv/source/row_win.cc new file mode 100644 index 000000000..f58fc5138 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/row_win.cc @@ -0,0 +1,7402 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#include <emmintrin.h>
+#include <tmmintrin.h>  // For _mm_maddubs_epi16
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+
+#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+
+#define UB 127 /* min(127,(int8)(2.018 * 64)) */
+#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+static const vec8 kUVToB = {
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const vec8 kUVToR = {
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const vec8 kUVToG = {
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const vec8 kVUToB = {
+  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = {
+  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = {
+  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+
+// 64 bit
+#if defined(_M_X64)
+
+// Aligned destination version.
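The constants above define a 6-bit fixed-point BT.601 YUV-to-RGB conversion: each coefficient is scaled by 64 (with UB clamped to 127 to fit int8), the U/V offset of 128 is folded into the kUVBias* values, and results are shifted right by 6 and saturated to bytes. A scalar sketch of the same per-pixel math follows; it is illustrative only, not part of the libyuv patch, and the helper names are invented for the sketch. The SIMD routines below compute exactly this for 8 or 16 pixels at a time.

    // Mirror of packuswb saturation.
    static unsigned char ClampToByte(int v) {
      return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    // One pixel of the fixed-point conversion performed by the I422ToARGBRow_* paths.
    static void YuvToRgbReference(unsigned char y, unsigned char u, unsigned char v,
                                  unsigned char* b, unsigned char* g, unsigned char* r) {
      int y1 = ((int)y - 16) * 74;                                             // (Y - 16) * YG
      *b = ClampToByte((y1 + ((int)u - 128) * 127) >> 6);                      // + (U - 128) * UB
      *g = ClampToByte((y1 - ((int)u - 128) * 25 - ((int)v - 128) * 52) >> 6); // UG and VG terms
      *r = ClampToByte((y1 + ((int)v - 128) * 102) >> 6);                      // + (V - 128) * VR
    }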
+__declspec(align(16)) +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __m128i xmm0, xmm1, xmm2, xmm3; + const __m128i xmm5 = _mm_set1_epi8(-1); + const __m128i xmm4 = _mm_setzero_si128(); + const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + + while (width > 0) { + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); + xmm1 = _mm_load_si128(&xmm0); + xmm2 = _mm_load_si128(&xmm0); + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); + xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB); + xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG); + xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR); + xmm3 = _mm_loadl_epi64((__m128i*)y_buf); + xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); + xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16); + xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb); + xmm0 = _mm_adds_epi16(xmm0, xmm3); + xmm1 = _mm_adds_epi16(xmm1, xmm3); + xmm2 = _mm_adds_epi16(xmm2, xmm3); + xmm0 = _mm_srai_epi16(xmm0, 6); + xmm1 = _mm_srai_epi16(xmm1, 6); + xmm2 = _mm_srai_epi16(xmm2, 6); + xmm0 = _mm_packus_epi16(xmm0, xmm0); + xmm1 = _mm_packus_epi16(xmm1, xmm1); + xmm2 = _mm_packus_epi16(xmm2, xmm2); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); + xmm1 = _mm_load_si128(&xmm0); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); + + _mm_store_si128((__m128i *)dst_argb, xmm0); + _mm_store_si128((__m128i *)(dst_argb + 16), xmm1); + + y_buf += 8; + u_buf += 4; + dst_argb += 32; + width -= 8; + } +} + +// Unaligned destination version. 
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __m128i xmm0, xmm1, xmm2, xmm3; + const __m128i xmm5 = _mm_set1_epi8(-1); + const __m128i xmm4 = _mm_setzero_si128(); + const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + + while (width > 0) { + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); + xmm1 = _mm_load_si128(&xmm0); + xmm2 = _mm_load_si128(&xmm0); + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); + xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB); + xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG); + xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR); + xmm3 = _mm_loadl_epi64((__m128i*)y_buf); + xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); + xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16); + xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb); + xmm0 = _mm_adds_epi16(xmm0, xmm3); + xmm1 = _mm_adds_epi16(xmm1, xmm3); + xmm2 = _mm_adds_epi16(xmm2, xmm3); + xmm0 = _mm_srai_epi16(xmm0, 6); + xmm1 = _mm_srai_epi16(xmm1, 6); + xmm2 = _mm_srai_epi16(xmm2, 6); + xmm0 = _mm_packus_epi16(xmm0, xmm0); + xmm1 = _mm_packus_epi16(xmm1, xmm1); + xmm2 = _mm_packus_epi16(xmm2, xmm2); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); + xmm1 = _mm_load_si128(&xmm0); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); + + _mm_storeu_si128((__m128i *)dst_argb, xmm0); + _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); + + y_buf += 8; + u_buf += 4; + dst_argb += 32; + width -= 8; + } +} +// 32 bit +#else // defined(_M_X64) + +#ifdef HAS_ARGBTOYROW_SSSE3 + +// Constants for ARGB. +static const vec8 kARGBToY = { + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 +}; + +// JPeg full range. +static const vec8 kARGBToYJ = { + 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 +}; + +static const vec8 kARGBToU = { + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 +}; + +static const vec8 kARGBToUJ = { + 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 +}; + +static const vec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +static const vec8 kARGBToVJ = { + -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 +}; + +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = { + 0, 4, 1, 5, 2, 6, 3, 7 +}; + +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, +}; + +// Constants for BGRA. +static const vec8 kBGRAToY = { + 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 +}; + +static const vec8 kBGRAToU = { + 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 +}; + +static const vec8 kBGRAToV = { + 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 +}; + +// Constants for ABGR. 
+static const vec8 kABGRToY = { + 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 +}; + +static const vec8 kABGRToU = { + -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 +}; + +static const vec8 kABGRToV = { + 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 +}; + +// Constants for RGBA. +static const vec8 kRGBAToY = { + 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 +}; + +static const vec8 kRGBAToU = { + 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 +}; + +static const vec8 kRGBAToV = { + 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 +}; + +static const uvec8 kAddY16 = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u +}; + +static const vec16 kAddYJ64 = { + 64, 64, 64, 64, 64, 64, 64, 64 +}; + +static const uvec8 kAddUV128 = { + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + +static const uvec16 kAddUVJ128 = { + 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u +}; + +// Shuffle table for converting RGB24 to ARGB. +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u +}; + +// Shuffle table for converting RAW to ARGB. +static const uvec8 kShuffleMaskRAWToARGB = { + 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u +}; + +// Shuffle table for converting ARGB to RGB24. +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGB to RAW. +static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u +}; + +// Shuffle table for converting ARGB to RAW. +static const uvec8 kShuffleMaskARGBToRAW_0 = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u +}; + +// Duplicates gray value 3 times and fills in alpha opaque. 
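The kARGBToY/kAddY16 tables above encode a weighted sum of the B, G, R bytes, divided by 128 and biased by 16; the J variants use the full-range weights and round instead of biasing, and the U/V tables work the same way with a +128 offset. A scalar sketch of that math (illustrative only, not part of the libyuv patch; function names are invented for the sketch):

    // Scalar equivalent of ARGBToYRow_*: ARGB is stored B,G,R,A in memory, so the
    // {13, 65, 33, 0} weights apply to blue, green and red respectively.
    static unsigned char ArgbToYReference(const unsigned char* argb) {
      return (unsigned char)(((13 * argb[0] + 65 * argb[1] + 33 * argb[2]) >> 7) + 16);
    }

    // JPEG full-range variant (ARGBToYJRow_*): {15, 75, 38} weights, +64 for
    // rounding before the shift, and no +16 bias.
    static unsigned char ArgbToYJReference(const unsigned char* argb) {
      return (unsigned char)((15 * argb[0] + 75 * argb[1] + 38 * argb[2] + 64) >> 7);
    }

    // U for one pixel (ARGBToUVRow_* additionally averages a 2x2 block first).
    static unsigned char ArgbToUReference(const unsigned char* argb) {
      return (unsigned char)(((112 * argb[0] - 74 * argb[1] - 38 * argb[2]) >> 8) + 128);
    }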
+__declspec(naked) __declspec(align(16)) +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + + align 4 + convertloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm5 + por xmm1, xmm5 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, + int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + + align 4 + convertloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm5 + por xmm1, xmm5 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_rgb24 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, kShuffleMaskRGB24ToARGB + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqa [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqa [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqa [edx + 16], xmm1 + por xmm3, xmm5 + sub ecx, 16 + movdqa [edx + 48], xmm3 + lea edx, [edx + 64] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, + int pix) { + __asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, kShuffleMaskRAWToARGB + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqa [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqa [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqa [edx + 16], xmm1 + por xmm3, xmm5 + sub ecx, 16 + movdqa [edx + 48], xmm3 + lea edx, [edx + 64] + jg convertloop + ret + } +} + +// pmul method to replicate bits. +// Math to replicate bits: +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 +// 20 instructions. 
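In scalar terms, widening a 5-bit channel to 8 bits by copying its top bits into the new low bits is the same as multiplying the left-aligned 16-bit field by 0x0108 and taking the high half of the product, which is what the pmulhuw in the routine below does; the 6-bit green field uses 0x2080 the same way. A scalar sketch (illustrative only, not part of the libyuv patch; helper names are invented for the sketch):

    // Bit replication done per channel by RGB565ToARGBRow_SSE2 below.
    static unsigned char Expand5(unsigned v) { return (unsigned char)((v << 3) | (v >> 2)); }
    static unsigned char Expand6(unsigned v) { return (unsigned char)((v << 2) | (v >> 4)); }

    static void Rgb565ToArgbReference(unsigned short p, unsigned char* argb) {
      argb[0] = Expand5(p & 0x1f);         // B: low 5 bits
      argb[1] = Expand6((p >> 5) & 0x3f);  // G: middle 6 bits
      argb[2] = Expand5(p >> 11);          // R: top 5 bits
      argb[3] = 255;                       // A: opaque (the 0xff00ff00 mask below)
    }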
+__declspec(naked) __declspec(align(16)) +void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green + psllw xmm4, 10 + psrlw xmm4, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + align 4 + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgr565 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + pand xmm1, xmm3 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + pand xmm0, xmm4 // G in middle 6 bits + pmulhuw xmm0, xmm6 // << 5 * (256 + 4) + por xmm0, xmm7 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +// 24 instructions +__declspec(naked) __declspec(align(16)) +void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green + psrlw xmm4, 6 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + align 4 + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of 1555 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + psllw xmm1, 1 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pand xmm1, xmm3 + pmulhuw xmm2, xmm5 // * (256 + 8) + pmulhuw xmm1, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + movdqa xmm2, xmm0 + pand xmm0, xmm4 // G in middle 5 bits + psraw xmm2, 8 // A + pmulhuw xmm0, xmm6 // << 6 * (256 + 8) + pand xmm2, xmm7 + por xmm0, xmm2 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +// 18 instructions. 
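The ARGB1555 and ARGB4444 routines that follow use the same replication idea: 5-bit fields expand as above, 4-bit fields expand by duplicating the nibble, and the 1-bit alpha expands to 0 or 255. A scalar sketch (illustrative only, not part of the libyuv patch; Expand5 is the helper from the previous sketch):

    static unsigned char Expand4(unsigned v) { return (unsigned char)(v * 0x11); }  // (v << 4) | v

    static void Argb1555ToArgbReference(unsigned short p, unsigned char* argb) {
      argb[0] = Expand5(p & 0x1f);          // B
      argb[1] = Expand5((p >> 5) & 0x1f);   // G
      argb[2] = Expand5((p >> 10) & 0x1f);  // R
      argb[3] = (p & 0x8000) ? 255 : 0;     // A: 1 bit replicated
    }

    static void Argb4444ToArgbReference(unsigned short p, unsigned char* argb) {
      argb[0] = Expand4(p & 0xf);           // B
      argb[1] = Expand4((p >> 4) & 0xf);    // G
      argb[2] = Expand4((p >> 8) & 0xf);    // R
      argb[3] = Expand4(p >> 12);           // A
    }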
+__declspec(naked) __declspec(align(16)) +void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + movd xmm4, eax + pshufd xmm4, xmm4, 0 + movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles + pslld xmm5, 4 + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + align 4 + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 + movdqa xmm2, xmm0 + pand xmm0, xmm4 // mask low nibbles + pand xmm2, xmm5 // mask high nibbles + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + psllw xmm1, 4 + psrlw xmm3, 4 + por xmm0, xmm1 + por xmm2, xmm3 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB + movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + movdqa xmm6, kShuffleMaskARGBToRGB24 + + align 4 + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + movdqa xmm6, kShuffleMaskARGBToRAW + + align 4 + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 27 + pcmpeqb xmm4, xmm4 // 
generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + align 4 + convertloop: + movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +// TODO(fbarchard): Improve sign extension/packing. +__declspec(naked) __declspec(align(16)) +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm4, xmm4 // generate mask 0x0000001f + psrld xmm4, 27 + movdqa xmm5, xmm4 // generate mask 0x000003e0 + pslld xmm5, 5 + movdqa xmm6, xmm4 // generate mask 0x00007c00 + pslld xmm6, 10 + pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 + pslld xmm7, 15 + + align 4 + convertloop: + movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + movdqa xmm3, xmm0 // R + psrad xmm0, 16 // A + psrld xmm1, 3 // B + psrld xmm2, 6 // G + psrld xmm3, 9 // R + pand xmm0, xmm7 // A + pand xmm1, xmm4 // B + pand xmm2, xmm5 // G + pand xmm3, xmm6 // R + por xmm0, xmm1 // BA + por xmm2, xmm3 // GR + por xmm0, xmm2 // BGRA + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 + psllw xmm4, 12 + movdqa xmm3, xmm4 // generate mask 0x00f000f0 + psrlw xmm3, 8 + + align 4 + convertloop: + movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 + pand xmm0, xmm3 // low nibble + pand xmm1, xmm4 // high nibble + psrl xmm0, 4 + psrl xmm1, 8 + por xmm0, xmm1 + packuswb xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +__declspec(naked) __declspec(align(16)) +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kARGBToY + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
+__declspec(naked) __declspec(align(16)) +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kARGBToYJ + movdqa xmm5, kAddYJ64 + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + paddw xmm0, xmm5 // Add .5 for rounding. + paddw xmm2, xmm5 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTOYROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +__declspec(naked) __declspec(align(32)) +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + vbroadcastf128 ymm4, kARGBToY + vbroadcastf128 ymm5, kAddY16 + vmovdqa ymm6, kPermdARGBToY_AVX + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. + vphaddw ymm2, ymm2, ymm3 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. + vpaddb ymm0, ymm0, ymm5 + sub ecx, 32 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ARGBTOYROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +__declspec(naked) __declspec(align(32)) +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + vbroadcastf128 ymm4, kARGBToYJ + vbroadcastf128 ymm5, kAddYJ64 + vmovdqa ymm6, kPermdARGBToY_AVX + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. + vphaddw ymm2, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. + vpaddw ymm2, ymm2, ymm5 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 
+ sub ecx, 32 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYJROW_AVX2 + +__declspec(naked) __declspec(align(16)) +void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kARGBToY + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kARGBToYJ + movdqa xmm5, kAddYJ64 + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + paddw xmm0, xmm5 + paddw xmm2, xmm5 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kBGRAToY + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kBGRAToY + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kABGRToY + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + 
phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kABGRToY + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kRGBAToY + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kRGBAToY + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // 
U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kARGBToUJ + movdqa xmm6, kARGBToVJ + movdqa xmm5, kAddUVJ128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + paddw xmm0, xmm5 // +.5 rounding -> unsigned + paddw xmm1, xmm5 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBTOUVROW_AVX2 +__declspec(naked) __declspec(align(32)) +void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vbroadcastf128 ymm5, kAddUV128 + vbroadcastf128 ymm6, kARGBToV + vbroadcastf128 ymm7, kARGBToU + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 32x2 argb pixels to 16x1 */ + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vpavgb ymm2, ymm2, [eax + esi + 64] + vpavgb ymm3, ymm3, [eax + esi + 96] + lea eax, [eax + 128] + vshufps ymm4, ymm0, ymm1, 0x88 + vshufps ymm0, ymm0, ymm1, 0xdd + vpavgb ymm0, ymm0, ymm4 // mutated by vshufps + vshufps ymm4, ymm2, ymm3, 0x88 + vshufps ymm2, ymm2, ymm3, 0xdd + vpavgb ymm2, ymm2, ymm4 // mutated by vshufps + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V + vpmaddubsw ymm1, ymm0, ymm7 // U + vpmaddubsw ymm3, ymm2, ymm7 + vpmaddubsw ymm0, ymm0, ymm6 // V + vpmaddubsw ymm2, ymm2, ymm6 + vphaddw ymm1, ymm1, ymm3 // mutates + vphaddw ymm0, ymm0, ymm2 + vpsraw ymm1, ymm1, 8 + vpsraw 
ymm0, ymm0, 8 + vpacksswb ymm0, ymm1, ymm0 // mutates + vpermq ymm0, ymm0, 0xd8 // For vpacksswb + vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw + vpaddb ymm0, ymm0, ymm5 // -> unsigned + + // step 3 - store 16 U and 16 V values + sub ecx, 32 + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V + lea edx, [edx + 16] + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBTOUVROW_AVX2 + +__declspec(naked) __declspec(align(16)) +void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kARGBToUJ + movdqa xmm6, kARGBToVJ + movdqa xmm5, kAddUVJ128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + 
phaddw xmm1, xmm3 + paddw xmm0, xmm5 // +.5 rounding -> unsigned + paddw xmm1, xmm5 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUV444Row_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* convert to U and V */ + movdqa xmm0, [eax] // U + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm0 + + movdqa xmm0, [eax] // V + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm6 + pmaddubsw xmm1, xmm6 + pmaddubsw xmm2, xmm6 + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + lea eax, [eax + 64] + movdqa [edx + edi], xmm0 + lea edx, [edx + 16] + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* convert to U and V */ + movdqu xmm0, [eax] // U + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqu [edx], xmm0 + + movdqu xmm0, [eax] // V + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm6 + pmaddubsw xmm1, xmm6 + pmaddubsw xmm2, xmm6 + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + lea eax, [eax + 64] + movdqu [edx + edi], xmm0 + lea edx, [edx + 16] + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUV422Row_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 
+ shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + movdqa xmm7, kARGBToU + movdqa xmm6, kARGBToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kBGRAToU + movdqa xmm6, kBGRAToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps 
qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kBGRAToU + movdqa xmm6, kBGRAToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kABGRToU + movdqa xmm6, kABGRToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, 
int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kABGRToU + movdqa xmm6, kABGRToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kRGBAToU + movdqa xmm6, kRGBAToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // 
dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kRGBAToU + movdqa xmm6, kRGBAToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 4 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBTOYROW_SSSE3 + +#ifdef HAS_I422TOARGBROW_AVX2 + +static const lvec8 kUVToB_AVX = { + UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, + UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB +}; +static const lvec8 kUVToR_AVX = { + UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, + UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR +}; +static const lvec8 kUVToG_AVX = { + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG +}; +static const lvec16 kYToRgb_AVX = { + YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG +}; +static const lvec16 kYSub16_AVX = { + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 +}; +static const lvec16 kUVBiasB_AVX = { + BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB +}; +static const lvec16 kUVBiasG_AVX = { + BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG +}; +static const lvec16 kUVBiasR_AVX = { + BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR +}; + +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
+__declspec(naked) __declspec(align(16)) +void I422ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpxor ymm4, ymm4, ymm4 + + align 4 + convertloop: + vmovq xmm0, qword ptr [esi] // U + vmovq xmm1, qword ptr [esi + edi] // V + lea esi, [esi + 8] + vpunpcklbw ymm0, ymm0, ymm1 // UV + vpermq ymm0, ymm0, 0xd8 + vpunpcklwd ymm0, ymm0, ymm0 // UVUV + vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV + vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV + vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV + vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed + vpsubw ymm1, ymm1, kUVBiasG_AVX + vpsubw ymm0, ymm0, kUVBiasR_AVX + + // Step 2: Find Y contribution to 16 R,G,B values + vmovdqu xmm3, [eax] // NOLINT + lea eax, [eax + 16] + vpermq ymm3, ymm3, 0xd8 + vpunpcklbw ymm3, ymm3, ymm4 + vpsubsw ymm3, ymm3, kYSub16_AVX + vpmullw ymm3, ymm3, kYToRgb_AVX + vpaddsw ymm2, ymm2, ymm3 // B += Y + vpaddsw ymm1, ymm1, ymm3 // G += Y + vpaddsw ymm0, ymm0, ymm3 // R += Y + vpsraw ymm2, ymm2, 6 + vpsraw ymm1, ymm1, 6 + vpsraw ymm0, ymm0, 6 + vpackuswb ymm2, ymm2, ymm2 // B + vpackuswb ymm1, ymm1, ymm1 // G + vpackuswb ymm0, ymm0, ymm0 // R + + // Step 3: Weave into ARGB + vpunpcklbw ymm2, ymm2, ymm1 // BG + vpermq ymm2, ymm2, 0xd8 + vpunpcklbw ymm0, ymm0, ymm5 // RA + vpermq ymm0, ymm0, 0xd8 + vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels + vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + vzeroupper + + pop edi + pop esi + ret + } +} +#endif // HAS_I422TOARGBROW_AVX2 + +#ifdef HAS_I422TOARGBROW_SSSE3 + +// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. + +// Read 8 UV from 444. +#define READYUV444 __asm { \ + __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ + __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + } + +// Read 4 UV from 422, upsample to 8 UV. +#define READYUV422 __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Read 2 UV from 411, upsample to 8 UV. +#define READYUV411 __asm { \ + __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ + __asm movd xmm0, ebx \ + __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm movd xmm1, ebx \ + __asm lea esi, [esi + 2] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Read 4 UV from NV12, upsample to 8 UV. +#define READNV12 __asm { \ + __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Convert 8 pixels: 8 UV and 8 Y. 
+#define YUVTORGB __asm { \ + /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ + __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ + __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ + __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ + __asm psubw xmm1, kUVBiasG \ + __asm psubw xmm2, kUVBiasR \ + /* Step 2: Find Y contribution to 8 R,G,B values */ \ + __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ + __asm lea eax, [eax + 8] \ + __asm punpcklbw xmm3, xmm4 \ + __asm psubsw xmm3, kYSub16 \ + __asm pmullw xmm3, kYToRgb \ + __asm paddsw xmm0, xmm3 /* B += Y */ \ + __asm paddsw xmm1, xmm3 /* G += Y */ \ + __asm paddsw xmm2, xmm3 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +// Convert 8 pixels: 8 VU and 8 Y. +#define YVUTORGB __asm { \ + /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \ + __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \ + __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \ + __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ + __asm psubw xmm1, kUVBiasG \ + __asm psubw xmm2, kUVBiasR \ + /* Step 2: Find Y contribution to 8 R,G,B values */ \ + __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ + __asm lea eax, [eax + 8] \ + __asm punpcklbw xmm3, xmm4 \ + __asm psubsw xmm3, kYSub16 \ + __asm pmullw xmm3, kYToRgb \ + __asm paddsw xmm0, xmm3 /* B += Y */ \ + __asm paddsw xmm1, xmm3 /* G += Y */ \ + __asm paddsw xmm2, xmm3 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +// 8 pixels, dest aligned 16. +// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV444 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
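+// Editor's note, not part of upstream libyuv: the two functions below emit
+// 24 bytes per 8 pixels (3 bytes per pixel, no alpha). A scalar sketch of the
+// per-pixel store they perform after YUVTORGB; StoreRgb24Sketch and
+// StoreRawSketch are illustrative names only.
+static __inline void StoreRgb24Sketch(uint8 b, uint8 g, uint8 r, uint8* dst) {
+  dst[0] = b;  // RGB24 is B, G, R in memory.
+  dst[1] = g;
+  dst[2] = r;
+}
+static __inline void StoreRawSketch(uint8 b, uint8 g, uint8 r, uint8* dst) {
+  dst[0] = r;  // RAW is the reversed byte order: R, G, B.
+  dst[1] = g;
+  dst[2] = b;
+}
+// The SSSE3 rows do this 8 pixels at a time with the kShuffleMaskARGBToRGB24*
+// and kShuffleMaskARGBToRAW* tables.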
+__declspec(naked) __declspec(align(16)) +void I422ToRGB24Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgb24, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb24 + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + movdqa xmm5, kShuffleMaskARGBToRGB24_0 + movdqa xmm6, kShuffleMaskARGBToRGB24 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RRGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm2 // RR + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRR first 4 pixels + punpckhwd xmm1, xmm2 // BGRR next 4 pixels + pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. + pshufb xmm1, xmm6 // Pack into first 12 bytes. + palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 + movq qword ptr [edx], xmm0 // First 8 bytes + movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels. + lea edx, [edx + 24] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I422ToRAWRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_raw, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // raw + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + movdqa xmm5, kShuffleMaskARGBToRAW_0 + movdqa xmm6, kShuffleMaskARGBToRAW + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RRGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm2 // RR + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRR first 4 pixels + punpckhwd xmm1, xmm2 // BGRR next 4 pixels + pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. + pshufb xmm1, xmm6 // Pack into first 12 bytes. + palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 + movq qword ptr [edx], xmm0 // First 8 bytes + movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels. + lea edx, [edx + 24] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest unaligned. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
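+// Editor's sketch, not part of upstream libyuv: the scalar equivalent of the
+// "Step 3b" packing in I422ToRGB565Row_SSSE3 below, which writes 16 bytes of
+// RGB565 per 8 pixels. Each B/G/R byte is reduced to 5/6/5 bits and packed
+// into one 16-bit value; the xmm5/xmm6/xmm7 masks generated below
+// (0x0000001f, 0x000007e0, 0xfffff800) isolate the same bit fields.
+// PackRgb565Sketch is an illustrative name only.
+static __inline uint16 PackRgb565Sketch(uint8 b, uint8 g, uint8 r) {
+  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
+}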
+__declspec(naked) __declspec(align(16)) +void I422ToRGB565Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb565_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb565 + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + pcmpeqb xmm5, xmm5 // generate mask 0x0000001f + psrld xmm5, 27 + pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 + psrld xmm6, 26 + pslld xmm6, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 + pslld xmm7, 11 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RRGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm2 // RR + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRR first 4 pixels + punpckhwd xmm1, xmm2 // BGRR next 4 pixels + + // Step 3b: RRGB -> RGB565 + movdqa xmm3, xmm0 // B first 4 pixels of argb + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm3, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm3, xmm5 // B + pand xmm2, xmm6 // G + pand xmm0, xmm7 // R + por xmm3, xmm2 // BG + por xmm0, xmm3 // BGR + movdqa xmm3, xmm1 // B next 4 pixels of argb + movdqa xmm2, xmm1 // G + pslld xmm1, 8 // R + psrld xmm3, 3 // B + psrld xmm2, 5 // G + psrad xmm1, 16 // R + pand xmm3, xmm5 // B + pand xmm2, xmm6 // G + pand xmm1, xmm7 // R + por xmm3, xmm2 // BG + por xmm1, xmm3 // BGR + packssdw xmm0, xmm1 + sub ecx, 8 + movdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +// Similar to I420 but duplicate UV once more. 
+__declspec(naked) __declspec(align(16)) +void I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ecx, [esp + 12 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV411 // modifies EBX + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + pop ebx + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READNV12 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // VU + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READNV12 + YVUTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +// 8 pixels, unaligned. +// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV444 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, unaligned. 
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, unaligned. +// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +// Similar to I420 but duplicate UV once more. +__declspec(naked) __declspec(align(16)) +void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ecx, [esp + 12 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV411 // modifies EBX + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + pop ebx + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) __declspec(align(16)) +void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READNV12 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
+__declspec(naked) __declspec(align(16)) +void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // VU + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READNV12 + YVUTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // bgra + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into BGRA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm0 // GB + punpcklbw xmm5, xmm2 // AR + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // BGRA first 4 pixels + punpckhwd xmm0, xmm1 // BGRA next 4 pixels + movdqa [edx], xmm5 + movdqa [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // bgra + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into BGRA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm0 // GB + punpcklbw xmm5, xmm2 // AR + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // BGRA first 4 pixels + punpckhwd xmm0, xmm1 // BGRA next 4 pixels + movdqu [edx], xmm5 + movdqu [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // abgr + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm2, xmm1 // RG + punpcklbw xmm0, xmm5 // BA + movdqa xmm1, xmm2 + punpcklwd xmm2, xmm0 // RGBA first 4 pixels + punpckhwd xmm1, xmm0 // RGBA next 4 pixels + movdqa [edx], xmm2 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + 
mov edx, [esp + 8 + 16] // abgr + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm2, xmm1 // RG + punpcklbw xmm0, xmm5 // BA + movdqa xmm1, xmm2 + punpcklwd xmm2, xmm0 // RGBA first 4 pixels + punpckhwd xmm1, xmm0 // RGBA next 4 pixels + movdqu [edx], xmm2 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToRGBARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgba, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgba + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RGBA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm2 // GR + punpcklbw xmm5, xmm0 // AB + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // RGBA first 4 pixels + punpckhwd xmm0, xmm1 // RGBA next 4 pixels + movdqa [edx], xmm5 + movdqa [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgba, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgba + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 4 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RGBA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm2 // GR + punpcklbw xmm5, xmm0 // AB + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // RGBA first 4 pixels + punpckhwd xmm0, xmm1 // RGBA next 4 pixels + movdqu [edx], xmm5 + movdqu [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +#endif // HAS_I422TOARGBROW_SSSE3 + +#ifdef HAS_YTOARGBROW_SSE2 +__declspec(naked) __declspec(align(16)) +void YToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width) { + __asm { + pxor xmm5, xmm5 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + mov eax, 0x00100010 + movd xmm3, eax + pshufd xmm3, xmm3, 0 + mov eax, 0x004a004a // 74 + movd xmm2, eax + pshufd xmm2, xmm2,0 + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width + + align 4 + convertloop: + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm5 // 0.Y + psubusw xmm0, xmm3 + pmullw xmm0, xmm2 + psrlw xmm0, 6 + packuswb xmm0, xmm0 // G + + // Step 2: Weave into ARGB + punpcklbw xmm0, xmm0 // GG + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 // BGRA first 4 pixels + punpckhwd xmm1, xmm1 // BGRA next 4 pixels + por xmm0, xmm4 + por xmm1, xmm4 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_YTOARGBROW_SSE2 + +#ifdef HAS_MIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. 
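+// Editor's sketch, not part of upstream libyuv: MirrorRow simply reverses a
+// row of bytes. The scalar equivalent is:
+//
+//   for (i = 0; i < width; ++i) dst[i] = src[width - 1 - i];
+//
+// The SSSE3 variant below walks the source backwards 16 bytes at a time and
+// reverses each block with pshufb using this descending-index table: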
+static const uvec8 kShuffleMirror = { + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +__declspec(naked) __declspec(align(16)) +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + movdqa xmm5, kShuffleMirror + lea eax, [eax - 16] + + align 4 + convertloop: + movdqa xmm0, [eax + ecx] + pshufb xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} +#endif // HAS_MIRRORROW_SSSE3 + +#ifdef HAS_MIRRORROW_AVX2 +// Shuffle table for reversing the bytes. +static const ulvec8 kShuffleMirror_AVX2 = { + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +__declspec(naked) __declspec(align(16)) +void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vmovdqa ymm5, kShuffleMirror_AVX2 + lea eax, [eax - 32] + + align 4 + convertloop: + vmovdqu ymm0, [eax + ecx] + vpshufb ymm0, ymm0, ymm5 + vpermq ymm0, ymm0, 0x4e // swap high and low halfs + sub ecx, 32 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_MIRRORROW_AVX2 + +#ifdef HAS_MIRRORROW_SSE2 +// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 +// version can not. +__declspec(naked) __declspec(align(16)) +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + lea eax, [eax - 16] + + align 4 + convertloop: + movdqu xmm0, [eax + ecx] + movdqa xmm1, xmm0 // swap bytes + psllw xmm0, 8 + psrlw xmm1, 8 + por xmm0, xmm1 + pshuflw xmm0, xmm0, 0x1b // swap words + pshufhw xmm0, xmm0, 0x1b + pshufd xmm0, xmm0, 0x4e // swap qwords + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} +#endif // HAS_MIRRORROW_SSE2 + +#ifdef HAS_MIRRORROW_UV_SSSE3 +// Shuffle table for reversing the bytes of UV channels. +static const uvec8 kShuffleMirrorUV = { + 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u +}; + +__declspec(naked) __declspec(align(16)) +void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + movdqa xmm1, kShuffleMirrorUV + lea eax, [eax + ecx * 2 - 16] + sub edi, edx + + align 4 + convertloop: + movdqa xmm0, [eax] + lea eax, [eax - 16] + pshufb xmm0, xmm1 + sub ecx, 8 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [edx + edi], xmm0 + lea edx, [edx + 8] + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MIRRORROW_UV_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSSE3 +// Shuffle table for reversing the bytes. +static const uvec8 kARGBShuffleMirror = { + 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u +}; + +__declspec(naked) __declspec(align(16)) +void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + lea eax, [eax - 16 + ecx * 4] // last 4 pixels. 
+ movdqa xmm5, kARGBShuffleMirror + + align 4 + convertloop: + movdqa xmm0, [eax] + lea eax, [eax - 16] + pshufb xmm0, xmm5 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} +#endif // HAS_ARGBMIRRORROW_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. +static const ulvec32 kARGBShuffleMirror_AVX2 = { + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +__declspec(naked) __declspec(align(16)) +void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + lea eax, [eax - 32] + vmovdqa ymm5, kARGBShuffleMirror_AVX2 + + align 4 + convertloop: + vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order + sub ecx, 8 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBMIRRORROW_AVX2 + +#ifdef HAS_SPLITUVROW_SSE2 +__declspec(naked) __declspec(align(16)) +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm0, xmm5 // even bytes + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm2, 8 // odd bytes + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqa [edx], xmm0 + movdqa [edx + edi], xmm2 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm0, xmm5 // even bytes + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm2, 8 // odd bytes + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqu [edx], xmm0 + movdqu [edx + edi], xmm2 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_SPLITUVROW_AVX2 +__declspec(naked) __declspec(align(16)) +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm2, ymm0, 8 // odd bytes + vpsrlw ymm3, ymm1, 8 + vpand ymm0, ymm0, ymm5 // even bytes + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpackuswb ymm2, ymm2, ymm3 + vpermq ymm0, ymm0, 0xd8 + vpermq ymm2, ymm2, 0xd8 + vmovdqu [edx], ymm0 + vmovdqu [edx + edi], ymm2 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_SPLITUVROW_AVX2 + +#ifdef HAS_MERGEUVROW_SSE2 +__declspec(naked) __declspec(align(16)) +void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 
+ int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width + sub edx, eax + + align 4 + convertloop: + movdqa xmm0, [eax] // read 16 U's + movdqa xmm1, [eax + edx] // and 16 V's + lea eax, [eax + 16] + movdqa xmm2, xmm0 + punpcklbw xmm0, xmm1 // first 8 UV pairs + punpckhbw xmm2, xmm1 // next 8 UV pairs + movdqa [edi], xmm0 + movdqa [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, + uint8* dst_uv, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width + sub edx, eax + + align 4 + convertloop: + movdqu xmm0, [eax] // read 16 U's + movdqu xmm1, [eax + edx] // and 16 V's + lea eax, [eax + 16] + movdqa xmm2, xmm0 + punpcklbw xmm0, xmm1 // first 8 UV pairs + punpckhbw xmm2, xmm1 // next 8 UV pairs + movdqu [edi], xmm0 + movdqu [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MERGEUVROW_SSE2 + +#ifdef HAS_MERGEUVROW_AVX2 +__declspec(naked) __declspec(align(16)) +void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width + sub edx, eax + + align 4 + convertloop: + vmovdqu ymm0, [eax] // read 32 U's + vmovdqu ymm1, [eax + edx] // and 32 V's + lea eax, [eax + 32] + vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 + vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 + vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0 + vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0 + vmovdqu [edi], ymm1 + vmovdqu [edi + 32], ymm2 + lea edi, [edi + 64] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_MERGEUVROW_AVX2 + +#ifdef HAS_COPYROW_SSE2 +// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. +__declspec(naked) __declspec(align(16)) +void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + ret + } +} +#endif // HAS_COPYROW_SSE2 + +// Unaligned Multiple of 1. 
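+// Editor's note, not part of upstream libyuv: CopyRow_ERMS below is a plain
+// byte copy, equivalent to memcpy(dst, src, count), implemented with
+// "rep movsb" for CPUs that have Enhanced REP MOVSB; it has no alignment or
+// size-multiple requirement, hence the comment above. CopyRow_X86 further
+// down uses "rep movsd" after shr ecx, 2, so it copies count/4 dwords and any
+// remaining 1-3 bytes are not copied.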
+__declspec(naked) __declspec(align(16)) +void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, esi + mov edx, edi + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // count + rep movsb + mov edi, edx + mov esi, eax + ret + } +} + +#ifdef HAS_COPYROW_X86 +__declspec(naked) __declspec(align(16)) +void CopyRow_X86(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, esi + mov edx, edi + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // count + shr ecx, 2 + rep movsd + mov edi, edx + mov esi, eax + ret + } +} +#endif // HAS_COPYROW_X86 + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + align 4 + convertloop: + movdqa xmm2, [eax] + movdqa xmm3, [eax + 16] + lea eax, [eax + 32] + movdqa xmm4, [edx] + movdqa xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqa [edx], xmm2 + movdqa [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + align 4 + convertloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + 32] + lea eax, [eax + 64] + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + align 4 + convertloop: + movq xmm2, qword ptr [eax] // 8 Y's + lea eax, [eax + 8] + punpcklbw xmm2, xmm2 + punpckhwd xmm3, xmm2 + punpcklwd xmm2, xmm2 + movdqa xmm4, [edx] + movdqa xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqa [edx], xmm2 + movdqa [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + align 4 + convertloop: + vpmovzxbd ymm1, qword ptr [eax] + vpmovzxbd ymm2, qword ptr [eax + 8] + lea eax, [eax + 16] + vpslld ymm1, ymm1, 24 + vpslld ymm2, ymm2, 24 + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, 
ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +// SetRow8 writes 'count' bytes using a 32 bit value repeated. +__declspec(naked) __declspec(align(16)) +void SetRow_X86(uint8* dst, uint32 v32, int count) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 + mov ecx, [esp + 12] // count + shr ecx, 2 + rep stosd + mov edi, edx + ret + } +} + +// SetRow32 writes 'count' words using a 32 bit value repeated. +__declspec(naked) __declspec(align(16)) +void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + __asm { + push esi + push edi + push ebp + mov edi, [esp + 12 + 4] // dst + mov eax, [esp + 12 + 8] // v32 + mov ebp, [esp + 12 + 12] // width + mov edx, [esp + 12 + 16] // dst_stride + mov esi, [esp + 12 + 20] // height + lea ecx, [ebp * 4] + sub edx, ecx // stride - width * 4 + + align 4 + convertloop: + mov ecx, ebp + rep stosd + add edi, edx + sub esi, 1 + jg convertloop + + pop ebp + pop edi + pop esi + ret + } +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_AVX2 +__declspec(naked) __declspec(align(16)) +void YUY2ToYRow_AVX2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // even bytes are Y + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + sub ecx, 32 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + vzeroupper + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. 
+ vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToYRow_AVX2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // odd bytes are Y + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + sub ecx, 32 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + ret + vzeroupper + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. 
+ vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_YUY2TOYROW_AVX2 + +#ifdef HAS_YUY2TOYROW_SSE2 +__declspec(naked) __declspec(align(16)) +void YUY2ToYRow_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 
+ mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToYRow_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + 
psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 4 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSE2 +// Blend 8 pixels at a time. +__declspec(naked) __declspec(align(16)) +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm7, xmm7 // generate constant 1 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + + sub ecx, 1 + je convertloop1 // only 1 pixel? + jl convertloop1b + + // 1 pixel loop until destination pointer is aligned. + alignloop1: + test edx, 15 // aligned? 
+ je alignloop1b + movd xmm3, [eax] + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + psrlw xmm3, 8 // alpha + pshufhw xmm3, xmm3, 0F5h // 8 alpha words + pshuflw xmm3, xmm3, 0F5h + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge alignloop1 + + alignloop1b: + add ecx, 1 - 4 + jl convertloop4b + + // 4 pixel loop. + convertloop4: + movdqu xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + psrlw xmm3, 8 // alpha + pshufhw xmm3, xmm3, 0F5h // 8 alpha words + pshuflw xmm3, xmm3, 0F5h + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jge convertloop4 + + convertloop4b: + add ecx, 4 - 1 + jl convertloop1b + + // 1 pixel loop. + convertloop1: + movd xmm3, [eax] // src argb + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + psrlw xmm3, 8 // alpha + pshufhw xmm3, xmm3, 0F5h // 8 alpha words + pshuflw xmm3, xmm3, 0F5h + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge convertloop1 + + convertloop1b: + pop esi + ret + } +} +#endif // HAS_ARGBBLENDROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. +static const uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; +// Same as SSE2, but replaces: +// psrlw xmm3, 8 // alpha +// pshufhw xmm3, xmm3, 0F5h // 8 alpha words +// pshuflw xmm3, xmm3, 0F5h +// with.. +// pshufb xmm3, kShuffleAlpha // alpha +// Blend 8 pixels at a time. + +__declspec(naked) __declspec(align(16)) +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm7, xmm7 // generate constant 0x0001 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + + sub ecx, 1 + je convertloop1 // only 1 pixel? + jl convertloop1b + + // 1 pixel loop until destination pointer is aligned. 
+ alignloop1: + test edx, 15 // aligned? + je alignloop1b + movd xmm3, [eax] + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge alignloop1 + + alignloop1b: + add ecx, 1 - 4 + jl convertloop4b + + test eax, 15 // unaligned? + jne convertuloop4 + test esi, 15 // unaligned? + jne convertuloop4 + + // 4 pixel loop. + convertloop4: + movdqa xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqa xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqa xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jge convertloop4 + jmp convertloop4b + + // 4 pixel unaligned loop. + convertuloop4: + movdqu xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jge convertuloop4 + + convertloop4b: + add ecx, 4 - 1 + jl convertloop1b + + // 1 pixel loop. + convertloop1: + movd xmm3, [eax] // src argb + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge convertloop1 + + convertloop1b: + pop esi + ret + } +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_SSE2 +// Attenuate 4 pixels at a time. +// Aligned to 16 bytes. 
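+//
+// Editor's sketch, not part of upstream libyuv: "attenuate" means premultiply
+// each color channel by the pixel's own alpha while leaving the alpha byte
+// unchanged. Scalar form; AttenuatePixelSketch is an illustrative name only,
+// and argb points at one B,G,R,A pixel.
+static __inline void AttenuatePixelSketch(uint8* argb) {
+  uint32 a = argb[3];                       // alpha byte.
+  argb[0] = (uint8)((argb[0] * a) >> 8);    // B * alpha
+  argb[1] = (uint8)((argb[1] * a) >> 8);    // G * alpha
+  argb[2] = (uint8)((argb[2] * a) >> 8);    // R * alpha
+}
+// The rows below approximate this 4 (or 8) pixels at a time: punpcklbw reg,reg
+// widens each byte c to the word c*257, pmulhuw against the replicated alpha
+// words gives (c*257 * a*257) >> 16, and the final psrlw 8 brings that back to
+// roughly (c*a) >> 8; the 0xff000000 mask re-inserts the original alpha.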
+__declspec(naked) __declspec(align(16)) +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff + psrld xmm5, 8 + + align 4 + convertloop: + movdqa xmm0, [eax] // read 4 pixels + punpcklbw xmm0, xmm0 // first 2 + pshufhw xmm2, xmm0, 0FFh // 8 alpha words + pshuflw xmm2, xmm2, 0FFh + pmulhuw xmm0, xmm2 // rgb * a + movdqa xmm1, [eax] // read 4 pixels + punpckhbw xmm1, xmm1 // next 2 pixels + pshufhw xmm2, xmm1, 0FFh // 8 alpha words + pshuflw xmm2, xmm2, 0FFh + pmulhuw xmm1, xmm2 // rgb * a + movdqa xmm2, [eax] // alphas + lea eax, [eax + 16] + psrlw xmm0, 8 + pand xmm2, xmm4 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + pand xmm0, xmm5 // keep original alphas + por xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + ret + } +} +#endif // HAS_ARGBATTENUATEROW_SSE2 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, +}; +static const uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, +}; +__declspec(naked) __declspec(align(16)) +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 + pslld xmm3, 24 + movdqa xmm4, kShuffleAlpha0 + movdqa xmm5, kShuffleAlpha1 + + align 4 + convertloop: + movdqu xmm0, [eax] // read 4 pixels + pshufb xmm0, xmm4 // isolate first 2 alphas + movdqu xmm1, [eax] // read 4 pixels + punpcklbw xmm1, xmm1 // first 2 pixel rgbs + pmulhuw xmm0, xmm1 // rgb * a + movdqu xmm1, [eax] // read 4 pixels + pshufb xmm1, xmm5 // isolate next 2 alphas + movdqu xmm2, [eax] // read 4 pixels + punpckhbw xmm2, xmm2 // next 2 pixel rgbs + pmulhuw xmm1, xmm2 // rgb * a + movdqu xmm2, [eax] // mask original alpha + lea eax, [eax + 16] + pand xmm2, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + por xmm0, xmm2 // copy original alpha + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + ret + } +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const ulvec8 kShuffleAlpha_AVX2 = { + 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, + 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, + 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, + 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, +}; +__declspec(naked) __declspec(align(16)) +void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vmovdqa ymm4, kShuffleAlpha_AVX2 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + align 4 + convertloop: + vmovdqu ymm6, [eax] // read 8 pixels. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. 
+ vpshufb ymm2, ymm0, ymm4 // low 4 alphas + vpshufb ymm3, ymm1, ymm4 // high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * a + vpmulhuw ymm1, ymm1, ymm3 // rgb * a + vpand ymm6, ymm6, ymm5 // isolate alpha + vpsrlw ymm0, ymm0, 8 + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // unmutated. + vpor ymm0, ymm0, ymm6 // copy original alpha + sub ecx, 8 + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +// Aligned to 16 bytes. +__declspec(naked) __declspec(align(16)) +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb0 + mov edx, [esp + 8 + 8] // dst_argb + mov ecx, [esp + 8 + 12] // width + + align 4 + convertloop: + movdqu xmm0, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 3] // first alpha + movzx edi, byte ptr [eax + 7] // second alpha + punpcklbw xmm0, xmm0 // first 2 + movd xmm2, dword ptr fixed_invtbl8[esi * 4] + movd xmm3, dword ptr fixed_invtbl8[edi * 4] + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm0, xmm2 // rgb * a + + movdqu xmm1, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 11] // third alpha + movzx edi, byte ptr [eax + 15] // forth alpha + punpckhbw xmm1, xmm1 // next 2 + movd xmm2, dword ptr fixed_invtbl8[esi * 4] + movd xmm3, dword ptr fixed_invtbl8[edi * 4] + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm1, xmm2 // rgb * a + lea eax, [eax + 16] + + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBUNATTENUATEROW_SSE2 + +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const ulvec8 kUnattenShuffleAlpha_AVX2 = { + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, +}; +// TODO(fbarchard): Enable USE_GATHER for future hardware if faster. +// USE_GATHER is not on by default, due to being a slow instruction. +#ifdef USE_GATHER +__declspec(naked) __declspec(align(16)) +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vmovdqa ymm4, kUnattenShuffleAlpha_AVX2 + + align 4 + convertloop: + vmovdqu ymm6, [eax] // read 8 pixels. + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. + vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a + vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a + vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. + vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a + vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia + vpackuswb ymm0, ymm0, ymm1 // unmutated. 
+ sub ecx, 8 + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + jg convertloop + + vzeroupper + ret + } +} +#else // USE_GATHER +__declspec(naked) __declspec(align(16)) +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, + int width) { + __asm { + + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vmovdqa ymm5, kUnattenShuffleAlpha_AVX2 + + push esi + push edi + + align 4 + convertloop: + // replace VPGATHER + movzx esi, byte ptr [eax + 3] // alpha0 + movzx edi, byte ptr [eax + 7] // alpha1 + vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0] + vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1] + movzx esi, byte ptr [eax + 11] // alpha2 + movzx edi, byte ptr [eax + 15] // alpha3 + vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] + vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2] + vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3] + movzx esi, byte ptr [eax + 19] // alpha4 + movzx edi, byte ptr [eax + 23] // alpha5 + vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] + vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4] + vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5] + movzx esi, byte ptr [eax + 27] // alpha6 + movzx edi, byte ptr [eax + 31] // alpha7 + vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] + vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6] + vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7] + vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] + vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] + vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] + vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] + // end of VPGATHER + + vmovdqu ymm6, [eax] // read 8 pixels. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a + vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. + vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a + vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia + vpackuswb ymm0, ymm0, ymm1 // unmutated. + sub ecx, 8 + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // USE_GATHER +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBGRAYROW_SSSE3 +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. +__declspec(naked) __declspec(align(16)) +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* width */ + movdqa xmm4, kARGBToYJ + movdqa xmm5, kAddYJ64 + + align 4 + convertloop: + movdqa xmm0, [eax] // G + movdqa xmm1, [eax + 16] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + phaddw xmm0, xmm1 + paddw xmm0, xmm5 // Add .5 for rounding. 
+ psrlw xmm0, 7 + packuswb xmm0, xmm0 // 8 G bytes + movdqa xmm2, [eax] // A + movdqa xmm3, [eax + 16] + lea eax, [eax + 32] + psrld xmm2, 24 + psrld xmm3, 24 + packuswb xmm2, xmm3 + packuswb xmm2, xmm2 // 8 A bytes + movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA + punpcklbw xmm0, xmm0 // 8 GG words + punpcklbw xmm3, xmm2 // 8 GA words + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm3 // GGGA first 4 + punpckhwd xmm1, xmm3 // GGGA next 4 + sub ecx, 8 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg convertloop + ret + } +} +#endif // HAS_ARGBGRAYROW_SSSE3 + +#ifdef HAS_ARGBSEPIAROW_SSSE3 +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +// Constant for ARGB color to sepia tone. +static const vec8 kARGBToSepiaB = { + 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 +}; + +static const vec8 kARGBToSepiaG = { + 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 +}; + +static const vec8 kARGBToSepiaR = { + 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 +}; + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. +__declspec(naked) __declspec(align(16)) +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + mov ecx, [esp + 8] /* width */ + movdqa xmm2, kARGBToSepiaB + movdqa xmm3, kARGBToSepiaG + movdqa xmm4, kARGBToSepiaR + + align 4 + convertloop: + movdqa xmm0, [eax] // B + movdqa xmm6, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm6, xmm2 + phaddw xmm0, xmm6 + psrlw xmm0, 7 + packuswb xmm0, xmm0 // 8 B values + movdqa xmm5, [eax] // G + movdqa xmm1, [eax + 16] + pmaddubsw xmm5, xmm3 + pmaddubsw xmm1, xmm3 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values + movdqa xmm5, [eax] // R + movdqa xmm1, [eax + 16] + pmaddubsw xmm5, xmm4 + pmaddubsw xmm1, xmm4 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 R values + movdqa xmm6, [eax] // A + movdqa xmm1, [eax + 16] + psrld xmm6, 24 + psrld xmm1, 24 + packuswb xmm6, xmm1 + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm5, xmm6 // 8 RA values + movdqa xmm1, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm5 // BGRA first 4 + punpckhwd xmm1, xmm5 // BGRA next 4 + sub ecx, 8 + movdqa [eax], xmm0 + movdqa [eax + 16], xmm1 + lea eax, [eax + 32] + jg convertloop + ret + } +} +#endif // HAS_ARGBSEPIAROW_SSSE3 + +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 +// Tranform 8 ARGB pixels (32 bytes) with color matrix. +// Same as Sepia except matrix is provided. +// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R +// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
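// --- Editorial note (not part of the upstream patch) -------------------------
// Scalar sketch of the colour-matrix row below: each output channel is a signed
// dot product of the pixel's B,G,R,A bytes with one 4-entry matrix row, scaled
// by >> 6 and clamped to [0, 255] (the pmaddubsw / phaddsw / psraw 6 / packuswb
// sequence). The sepia row above follows the same pattern with fixed
// coefficients and a >> 7 scale. Helper name is hypothetical; types from
// <stdint.h>.
static inline uint8_t ColorMatrixChannelSketch(const uint8_t bgra[4],
                                               const int8_t row[4]) {
  int v = (bgra[0] * row[0] + bgra[1] * row[1] +
           bgra[2] * row[2] + bgra[3] * row[3]) >> 6;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
// ------------------------------------------------------------------------------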
+__declspec(naked) __declspec(align(16)) +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ + movdqu xmm5, [ecx] + pshufd xmm2, xmm5, 0x00 + pshufd xmm3, xmm5, 0x55 + pshufd xmm4, xmm5, 0xaa + pshufd xmm5, xmm5, 0xff + mov ecx, [esp + 16] /* width */ + + align 4 + convertloop: + movdqa xmm0, [eax] // B + movdqa xmm7, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm7, xmm2 + movdqa xmm6, [eax] // G + movdqa xmm1, [eax + 16] + pmaddubsw xmm6, xmm3 + pmaddubsw xmm1, xmm3 + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values + movdqa xmm1, [eax] // R + movdqa xmm7, [eax + 16] + pmaddubsw xmm1, xmm4 + pmaddubsw xmm7, xmm4 + phaddsw xmm1, xmm7 // R + movdqa xmm6, [eax] // A + movdqa xmm7, [eax + 16] + pmaddubsw xmm6, xmm5 + pmaddubsw xmm7, xmm5 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 + sub ecx, 8 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm6 + lea eax, [eax + 32] + lea edx, [edx + 32] + jg convertloop + ret + } +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). +// Aligned to 16 bytes. +__declspec(naked) __declspec(align(16)) +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + movd xmm2, [esp + 8] /* scale */ + movd xmm3, [esp + 12] /* interval_size */ + movd xmm4, [esp + 16] /* interval_offset */ + mov ecx, [esp + 20] /* width */ + pshuflw xmm2, xmm2, 040h + pshufd xmm2, xmm2, 044h + pshuflw xmm3, xmm3, 040h + pshufd xmm3, xmm3, 044h + pshuflw xmm4, xmm4, 040h + pshufd xmm4, xmm4, 044h + pxor xmm5, xmm5 // constant 0 + pcmpeqb xmm6, xmm6 // generate mask 0xff000000 + pslld xmm6, 24 + + align 4 + convertloop: + movdqa xmm0, [eax] // read 4 pixels + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 + movdqa xmm1, [eax] // read 4 pixels + punpckhbw xmm1, xmm5 // next 2 pixels + pmulhuw xmm1, xmm2 + pmullw xmm0, xmm3 // * interval_size + movdqa xmm7, [eax] // read 4 pixels + pmullw xmm1, xmm3 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 + paddw xmm1, xmm4 + packuswb xmm0, xmm1 + por xmm0, xmm7 + sub ecx, 4 + movdqa [eax], xmm0 + lea eax, [eax + 16] + jg convertloop + ret + } +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. +// Aligned to 16 bytes. 
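// --- Editorial note (not part of the upstream patch) -------------------------
// Scalar sketch of the shade row below: every channel is scaled by the matching
// byte of 'value', approximately c * v / 255, using the same 0x0101 widening
// trick as the attenuate sketch earlier. Function name is hypothetical; types
// from <stdint.h>.
static void ARGBShadeRowSketch(const uint8_t* src_argb, uint8_t* dst_argb,
                               int width, uint32_t value) {
  for (int i = 0; i < width * 4; ++i) {
    uint8_t v = (uint8_t)(value >> ((i & 3) * 8));  // B,G,R,A byte of 'value'
    dst_argb[i] =
        (uint8_t)(((src_argb[i] * 0x0101u) * (v * 0x0101u)) >> 24);
  }
}
// ------------------------------------------------------------------------------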
+__declspec(naked) __declspec(align(16)) +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + movd xmm2, [esp + 16] // value + punpcklbw xmm2, xmm2 + punpcklqdq xmm2, xmm2 + + align 4 + convertloop: + movdqa xmm0, [eax] // read 4 pixels + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + ret + } +} +#endif // HAS_ARGBSHADEROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. +__declspec(naked) __declspec(align(16)) +void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pxor xmm5, xmm5 // constant 0 + + align 4 + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm2, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, xmm0 + movdqu xmm3, xmm2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + lea eax, [eax + 16] + lea esi, [esi + 16] + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + +#ifdef HAS_ARGBADDROW_SSE2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. +// TODO(fbarchard): Port this to posix, neon and other math functions. +__declspec(naked) __declspec(align(16)) +void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + sub ecx, 4 + jl convertloop49 + + align 4 + convertloop4: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + paddusb xmm0, xmm1 // src_argb0 + src_argb1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jge convertloop4 + + convertloop49: + add ecx, 4 - 1 + jl convertloop19 + + convertloop1: + movd xmm0, [eax] // read 1 pixels from src_argb0 + lea eax, [eax + 4] + movd xmm1, [esi] // read 1 pixels from src_argb1 + lea esi, [esi + 4] + paddusb xmm0, xmm1 // src_argb0 + src_argb1 + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge convertloop1 + + convertloop19: + pop esi + ret + } +} +#endif // HAS_ARGBADDROW_SSE2 + +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +// Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
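// --- Editorial note (not part of the upstream patch) -------------------------
// The arithmetic rows here reduce to simple per-channel scalar operations:
// multiply (above) approximates a * b / 255 via an 8.8 fixed-point high
// multiply, while add (above) and subtract (below) saturate to [0, 255], as
// paddusb / psubusb do. Helper names are hypothetical; types from <stdint.h>.
static inline uint8_t MultiplyChannelSketch(uint8_t a, uint8_t b) {
  return (uint8_t)((a * 0x0101u * b) >> 16);  // ~ a * b / 255
}
static inline uint8_t AddChannelSketch(uint8_t a, uint8_t b) {
  unsigned s = (unsigned)a + b;
  return (uint8_t)(s > 255u ? 255u : s);
}
static inline uint8_t SubtractChannelSketch(uint8_t a, uint8_t b) {
  return (uint8_t)(a > b ? a - b : 0);
}
// ------------------------------------------------------------------------------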
+__declspec(naked) __declspec(align(16)) +void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + align 4 + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + psubusb xmm0, xmm1 // src_argb0 - src_argb1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) __declspec(align(16)) +void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + vpxor ymm5, ymm5, ymm5 // constant 0 + + align 4 + convertloop: + vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 + lea esi, [esi + 32] + vpunpcklbw ymm0, ymm1, ymm1 // low 4 + vpunpckhbw ymm1, ymm1, ymm1 // high 4 + vpunpcklbw ymm2, ymm3, ymm5 // low 4 + vpunpckhbw ymm3, ymm3, ymm5 // high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 + vpackuswb ymm0, ymm0, ymm1 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_AVX2 + +#ifdef HAS_ARGBADDROW_AVX2 +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) __declspec(align(16)) +void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + align 4 + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBADDROW_AVX2 + +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
+__declspec(naked) __declspec(align(16)) +void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + align 4 + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_AVX2 + +#ifdef HAS_SOBELXROW_SSE2 +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +__declspec(naked) __declspec(align(16)) +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 + mov edi, [esp + 8 + 12] // src_y2 + mov edx, [esp + 8 + 16] // dst_sobelx + mov ecx, [esp + 8 + 20] // width + sub esi, eax + sub edi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + align 4 + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + sub ecx, 8 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_SOBELXROW_SSE2 + +#ifdef HAS_SOBELYROW_SSE2 +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +__declspec(naked) __declspec(align(16)) +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 + mov edx, [esp + 4 + 12] // dst_sobely + mov ecx, [esp + 4 + 16] // width + sub esi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + align 4 + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + sub ecx, 8 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELYROW_SSE2 + +#ifdef HAS_SOBELROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +__declspec(naked) __declspec(align(16)) +void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + pslld xmm5, 24 // 0xff000000 + + align 4 + convertloop: + movdqa xmm0, [eax] // read 16 pixels src_sobelx + movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqa xmm2, xmm0 // GG + punpcklbw xmm2, xmm0 // First 8 + punpckhbw xmm0, xmm0 // Next 8 + movdqa xmm1, xmm2 // GGGG + punpcklwd xmm1, xmm2 // First 4 + punpckhwd xmm2, xmm2 // Next 4 + por xmm1, xmm5 // GGGA + por xmm2, xmm5 + movdqa xmm3, xmm0 // GGGG + punpcklwd xmm3, xmm0 // Next 4 + punpckhwd xmm0, xmm0 // Last 4 + por xmm3, xmm5 // GGGA + por xmm0, xmm5 + sub ecx, 16 + movdqa [edx], xmm1 + movdqa [edx + 16], xmm2 + movdqa [edx + 32], xmm3 + movdqa [edx + 48], xmm0 + lea edx, [edx + 64] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELROW_SSE2 + +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. +__declspec(naked) __declspec(align(16)) +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + + align 4 + convertloop: + movdqa xmm0, [eax] // read 16 pixels src_sobelx + movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + +#ifdef HAS_SOBELXYROW_SSE2 +// Mixes Sobel X, Sobel Y and Sobel into ARGB. 
+// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +__declspec(naked) __declspec(align(16)) +void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + + align 4 + convertloop: + movdqa xmm0, [eax] // read 16 pixels src_sobelx + movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + movdqa xmm2, xmm0 + paddusb xmm2, xmm1 // sobel = sobelx + sobely + movdqa xmm3, xmm0 // XA + punpcklbw xmm3, xmm5 + punpckhbw xmm0, xmm5 + movdqa xmm4, xmm1 // YS + punpcklbw xmm4, xmm2 + punpckhbw xmm1, xmm2 + movdqa xmm6, xmm4 // YSXA + punpcklwd xmm6, xmm3 // First 4 + punpckhwd xmm4, xmm3 // Next 4 + movdqa xmm7, xmm1 // YSXA + punpcklwd xmm7, xmm0 // Next 4 + punpckhwd xmm1, xmm0 // Last 4 + sub ecx, 16 + movdqa [edx], xmm6 + movdqa [edx + 16], xmm4 + movdqa [edx + 32], xmm7 + movdqa [edx + 48], xmm1 + lea edx, [edx + 64] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELXYROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +// Consider float CumulativeSum. +// Consider calling CumulativeSum one row at time as needed. +// Consider circular CumulativeSum buffer of radius * 2 + 1 height. +// Convert cumulative sum for an area to an average for 1 pixel. +// topleft is pointer to top left of CumulativeSum buffer for area. +// botleft is pointer to bottom left of CumulativeSum buffer. +// width is offset from left to right of area in CumulativeSum buffer measured +// in number of ints. +// area is the number of pixels in the area being averaged. +// dst points to pixel to store result to. +// count is number of averaged pixels to produce. +// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte +// aligned. +void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, + int count) { + __asm { + mov eax, topleft // eax topleft + mov esi, botleft // esi botleft + mov edx, width + movd xmm5, area + mov edi, dst + mov ecx, count + cvtdq2ps xmm5, xmm5 + rcpss xmm4, xmm5 // 1.0f / area + pshufd xmm4, xmm4, 0 + sub ecx, 4 + jl l4b + + cmp area, 128 // 128 pixels will not overflow 15 bits. + ja l4 + + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + psrld xmm6, 16 + cvtdq2ps xmm6, xmm6 + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area + cvtps2dq xmm5, xmm5 // 0.16 fixed point + packssdw xmm5, xmm5 // 16 bit shorts + + // 4 pixel loop small blocks. 
+ align 4 + s4: + // top left + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + packssdw xmm0, xmm1 // pack 4 pixels into 2 registers + packssdw xmm2, xmm3 + + pmulhuw xmm0, xmm5 + pmulhuw xmm2, xmm5 + + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge s4 + + jmp l4b + + // 4 pixel loop + align 4 + l4: + // top left + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area + cvtdq2ps xmm1, xmm1 + mulps xmm0, xmm4 + mulps xmm1, xmm4 + cvtdq2ps xmm2, xmm2 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + align 4 + l1: + movdqa xmm0, [eax] + psubd xmm0, [eax + edx * 4] + lea eax, [eax + 16] + psubd xmm0, [esi] + paddd xmm0, [esi + edx * 4] + lea esi, [esi + 16] + cvtdq2ps xmm0, xmm0 + mulps xmm0, xmm4 + cvtps2dq xmm0, xmm0 + packssdw xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 1 + jge l1 + l1b: + } +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the left of the value. +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) { + __asm { + mov eax, row + mov edx, cumsum + mov esi, previous_cumsum + mov ecx, width + pxor xmm0, xmm0 + pxor xmm1, xmm1 + + sub ecx, 4 + jl l4b + test edx, 15 + jne l4b + + // 4 pixel loop + align 4 + l4: + movdqu xmm2, [eax] // 4 argb pixels 16 bytes. + lea eax, [eax + 16] + movdqa xmm4, xmm2 + + punpcklbw xmm2, xmm1 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm1 + punpckhwd xmm3, xmm1 + + punpckhbw xmm4, xmm1 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + + paddd xmm0, xmm2 + movdqa xmm2, [esi] // previous row above. 
+ paddd xmm2, xmm0 + + paddd xmm0, xmm3 + movdqa xmm3, [esi + 16] + paddd xmm3, xmm0 + + paddd xmm0, xmm4 + movdqa xmm4, [esi + 32] + paddd xmm4, xmm0 + + paddd xmm0, xmm5 + movdqa xmm5, [esi + 48] + lea esi, [esi + 64] + paddd xmm5, xmm0 + + movdqa [edx], xmm2 + movdqa [edx + 16], xmm3 + movdqa [edx + 32], xmm4 + movdqa [edx + 48], xmm5 + + lea edx, [edx + 64] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + align 4 + l1: + movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. + lea eax, [eax + 4] + punpcklbw xmm2, xmm1 + punpcklwd xmm2, xmm1 + paddd xmm0, xmm2 + movdqu xmm2, [esi] + lea esi, [esi + 16] + paddd xmm2, xmm0 + movdqu [edx], xmm2 + lea edx, [edx + 16] + sub ecx, 1 + jge l1 + + l1b: + } +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. +__declspec(naked) __declspec(align(16)) +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width) { + __asm { + push esi + push edi + mov eax, [esp + 12] // src_argb + mov esi, [esp + 16] // stride + mov edx, [esp + 20] // dst_argb + mov ecx, [esp + 24] // pointer to uv_dudv + movq xmm2, qword ptr [ecx] // uv + movq xmm7, qword ptr [ecx + 8] // dudv + mov ecx, [esp + 28] // width + shl esi, 16 // 4, stride + add esi, 4 + movd xmm5, esi + sub ecx, 4 + jl l4b + + // setup for 4 pixel loop + pshufd xmm7, xmm7, 0x44 // dup dudv + pshufd xmm5, xmm5, 0 // dup 4, stride + movdqa xmm0, xmm2 // x0, y0, x1, y1 + addps xmm0, xmm7 + movlhps xmm2, xmm0 + movdqa xmm4, xmm7 + addps xmm4, xmm4 // dudv *= 2 + movdqa xmm3, xmm2 // x2, y2, x3, y3 + addps xmm3, xmm4 + addps xmm4, xmm4 // dudv *= 4 + + // 4 pixel loop + align 4 + l4: + cvttps2dq xmm0, xmm2 // x, y float to int first 2 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 + packssdw xmm0, xmm1 // x, y as 8 shorts + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
+ movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd xmm1, [eax + esi] // read pixel 0 + movd xmm6, [eax + edi] // read pixel 1 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 + addps xmm2, xmm4 // x, y += dx, dy first 2 + movq qword ptr [edx], xmm1 + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + movd xmm6, [eax + esi] // read pixel 2 + movd xmm0, [eax + edi] // read pixel 3 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 + addps xmm3, xmm4 // x, y += dx, dy next 2 + sub ecx, 4 + movq qword ptr 8[edx], xmm6 + lea edx, [edx + 16] + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + align 4 + l1: + cvttps2dq xmm0, xmm2 // x, y float to int + packssdw xmm0, xmm0 // x, y as shorts + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride + addps xmm2, xmm7 // x, y += dx, dy + movd esi, xmm0 + movd xmm0, [eax + esi] // copy a pixel + sub ecx, 1 + movd [edx], xmm0 + lea edx, [edx + 4] + jge l1 + l1b: + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) __declspec(align(16)) +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + shr eax, 1 + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 128. Blend 100 / 0. + sub edi, esi + cmp eax, 32 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. + cmp eax, 64 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. + cmp eax, 96 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. + + vmovd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + vmovd xmm5, eax // low fraction 128..1 + vpunpcklbw xmm5, xmm5, xmm0 + vpunpcklwd xmm5, xmm5, xmm5 + vpxor ymm0, ymm0, ymm0 + vpermd ymm5, ymm0, ymm5 + + align 4 + xloop: + vmovdqu ymm0, [esi] + vmovdqu ymm2, [esi + edx] + vpunpckhbw ymm1, ymm0, ymm2 // mutates + vpunpcklbw ymm0, ymm0, ymm2 // mutates + vpmaddubsw ymm0, ymm0, ymm5 + vpmaddubsw ymm1, ymm1, ymm5 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm1, ymm1, 7 + vpackuswb ymm0, ymm0, ymm1 // unmutates + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop + jmp xloop99 + + // Blend 25 / 75. + align 4 + xloop25: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + vpavgb ymm0, ymm0, [esi + edx] + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 4 + xloop50: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + align 4 + xloop75: + vmovdqu ymm0, [esi + edx] + vpavgb ymm0, ymm0, [esi] + vpavgb ymm0, ymm0, [esi] + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. 
+ align 4 + xloop100: + rep movsb + + xloop99: + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_INTERPOLATEROW_AVX2 + +#ifdef HAS_INTERPOLATEROW_SSSE3 +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) __declspec(align(16)) +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + shr eax, 1 + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 128. Blend 100 / 0. + cmp eax, 32 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. + cmp eax, 64 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. + cmp eax, 96 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. + + movd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + movd xmm5, eax // low fraction 128..1 + punpcklbw xmm5, xmm0 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + + align 4 + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop + jmp xloop99 + + // Blend 25 / 75. + align 4 + xloop25: + movdqa xmm0, [esi] + movdqa xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 4 + xloop50: + movdqa xmm0, [esi] + movdqa xmm1, [esi + edx] + pavgb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + align 4 + xloop75: + movdqa xmm1, [esi] + movdqa xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + align 4 + xloop100: + movdqa xmm0, [esi] + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} +#endif // HAS_INTERPOLATEROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_SSE2 +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) __declspec(align(16)) +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 256. Blend 100 / 0. + cmp eax, 64 + je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. + cmp eax, 128 + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + cmp eax, 192 + je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 
+ + movd xmm5, eax // xmm5 = y fraction + punpcklbw xmm5, xmm5 + psrlw xmm5, 1 + punpcklwd xmm5, xmm5 + punpckldq xmm5, xmm5 + punpcklqdq xmm5, xmm5 + pxor xmm4, xmm4 + + align 4 + xloop: + movdqa xmm0, [esi] // row0 + movdqa xmm2, [esi + edx] // row1 + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + psubw xmm2, xmm0 // row1 - row0 + psubw xmm3, xmm1 + paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 + paddw xmm3, xmm3 + pmulhw xmm2, xmm5 // scale diff + pmulhw xmm3, xmm5 + paddw xmm0, xmm2 // sum rows + paddw xmm1, xmm3 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop + jmp xloop99 + + // Blend 25 / 75. + align 4 + xloop25: + movdqa xmm0, [esi] + movdqa xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 4 + xloop50: + movdqa xmm0, [esi] + movdqa xmm1, [esi + edx] + pavgb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + align 4 + xloop75: + movdqa xmm1, [esi] + movdqa xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + align 4 + xloop100: + movdqa xmm0, [esi] + sub ecx, 16 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} +#endif // HAS_INTERPOLATEROW_SSE2 + +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) __declspec(align(16)) +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + shr eax, 1 + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 128. Blend 100 / 0. + cmp eax, 32 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. + cmp eax, 64 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. + cmp eax, 96 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. + + movd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + movd xmm5, eax // low fraction 128..1 + punpcklbw xmm5, xmm0 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + + align 4 + xloop: + movdqu xmm0, [esi] + movdqu xmm2, [esi + edx] + movdqu xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop + jmp xloop99 + + // Blend 25 / 75. + align 4 + xloop25: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 4 + xloop50: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. 
+ align 4 + xloop75: + movdqu xmm1, [esi] + movdqu xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + align 4 + xloop100: + movdqu xmm0, [esi] + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} + +#ifdef HAS_INTERPOLATEROW_SSE2 +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) __declspec(align(16)) +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 256. Blend 100 / 0. + cmp eax, 64 + je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. + cmp eax, 128 + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + cmp eax, 192 + je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. + + movd xmm5, eax // xmm5 = y fraction + punpcklbw xmm5, xmm5 + psrlw xmm5, 1 + punpcklwd xmm5, xmm5 + punpckldq xmm5, xmm5 + punpcklqdq xmm5, xmm5 + pxor xmm4, xmm4 + + align 4 + xloop: + movdqu xmm0, [esi] // row0 + movdqu xmm2, [esi + edx] // row1 + movdqu xmm1, xmm0 + movdqu xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + psubw xmm2, xmm0 // row1 - row0 + psubw xmm3, xmm1 + paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 + paddw xmm3, xmm3 + pmulhw xmm2, xmm5 // scale diff + pmulhw xmm3, xmm5 + paddw xmm0, xmm2 // sum rows + paddw xmm1, xmm3 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop + jmp xloop99 + + // Blend 25 / 75. + align 4 + xloop25: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 4 + xloop50: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + align 4 + xloop75: + movdqu xmm1, [esi] + movdqu xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. 
+ align 4 + xloop100: + movdqu xmm0, [esi] + sub ecx, 16 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} +#endif // HAS_INTERPOLATEROW_SSE2 + +__declspec(naked) __declspec(align(16)) +void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // src_uv_stride + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + sub edi, eax + + align 4 + convertloop: + movdqa xmm0, [eax] + pavgb xmm0, [eax + edx] + sub ecx, 16 + movdqa [eax + edi], xmm0 + lea eax, [eax + 16] + jg convertloop + pop edi + ret + } +} + +#ifdef HAS_HALFROW_AVX2 +__declspec(naked) __declspec(align(16)) +void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // src_uv_stride + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + sub edi, eax + + align 4 + convertloop: + vmovdqu ymm0, [eax] + vpavgb ymm0, ymm0, [eax + edx] + sub ecx, 32 + vmovdqu [eax + edi], ymm0 + lea eax, [eax + 32] + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_HALFROW_AVX2 + +__declspec(naked) __declspec(align(16)) +void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_bayer + movd xmm5, [esp + 12] // selector + mov ecx, [esp + 16] // pix + pshufd xmm5, xmm5, 0 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pshufb xmm0, xmm5 + pshufb xmm1, xmm5 + punpckldq xmm0, xmm1 + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + ret + } +} + +// Specialized ARGB to Bayer that just isolates G channel. +__declspec(naked) __declspec(align(16)) +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_bayer + // selector + mov ecx, [esp + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x000000ff + psrld xmm5, 24 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrld xmm0, 8 // Move green to bottom. + psrld xmm1, 8 + pand xmm0, xmm5 + pand xmm1, xmm5 + packssdw xmm0, xmm1 + packuswb xmm0, xmm1 + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + ret + } +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
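// --- Editorial note (not part of the upstream patch) -------------------------
// Scalar sketch of the shuffle rows below: 'shuffler' is a pshufb-style byte
// index table whose first four entries give, for each destination byte of a
// pixel, the source byte to copy. This assumes the table repeats the same
// 4-byte pattern for every pixel, as the BGRA/ABGR/RGBA masks in this library
// do. Function name is hypothetical; types from <stdint.h>.
static void ARGBShuffleRowSketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                 const uint8_t* shuffler, int pix) {
  for (int x = 0; x < pix; ++x) {
    for (int i = 0; i < 4; ++i) {
      dst_argb[x * 4 + i] = src_argb[x * 4 + (shuffler[i] & 3)];
    }
  }
}
// ------------------------------------------------------------------------------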
+__declspec(naked) __declspec(align(16)) +void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + movdqa xmm5, [ecx] + mov ecx, [esp + 16] // pix + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pshufb xmm0, xmm5 + pshufb xmm1, xmm5 + sub ecx, 8 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + movdqa xmm5, [ecx] + mov ecx, [esp + 16] // pix + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pshufb xmm0, xmm5 + pshufb xmm1, xmm5 + sub ecx, 8 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + ret + } +} + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +__declspec(naked) __declspec(align(16)) +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. + mov ecx, [esp + 16] // pix + + align 4 + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpshufb ymm0, ymm0, ymm5 + vpshufb ymm1, ymm1, ymm5 + sub ecx, 16 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + jg wloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +__declspec(naked) __declspec(align(16)) +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + push ebx + push esi + mov eax, [esp + 8 + 4] // src_argb + mov edx, [esp + 8 + 8] // dst_argb + mov esi, [esp + 8 + 12] // shuffler + mov ecx, [esp + 8 + 16] // pix + pxor xmm5, xmm5 + + mov ebx, [esi] // shuffler + cmp ebx, 0x03000102 + je shuf_3012 + cmp ebx, 0x00010203 + je shuf_0123 + cmp ebx, 0x00030201 + je shuf_0321 + cmp ebx, 0x02010003 + je shuf_2103 + + // TODO(fbarchard): Use one source pointer and 3 offsets. 
+ shuf_any1: + movzx ebx, byte ptr [esi] + movzx ebx, byte ptr [eax + ebx] + mov [edx], bl + movzx ebx, byte ptr [esi + 1] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 1], bl + movzx ebx, byte ptr [esi + 2] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 2], bl + movzx ebx, byte ptr [esi + 3] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 3], bl + lea eax, [eax + 4] + lea edx, [edx + 4] + sub ecx, 1 + jg shuf_any1 + jmp shuf99 + + align 4 + shuf_0123: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB + pshuflw xmm0, xmm0, 01Bh + pshufhw xmm1, xmm1, 01Bh + pshuflw xmm1, xmm1, 01Bh + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_0123 + jmp shuf99 + + align 4 + shuf_0321: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB + pshuflw xmm0, xmm0, 039h + pshufhw xmm1, xmm1, 039h + pshuflw xmm1, xmm1, 039h + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_0321 + jmp shuf99 + + align 4 + shuf_2103: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA + pshuflw xmm0, xmm0, 093h + pshufhw xmm1, xmm1, 093h + pshuflw xmm1, xmm1, 093h + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_2103 + jmp shuf99 + + align 4 + shuf_3012: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB + pshuflw xmm0, xmm0, 0C6h + pshufhw xmm1, xmm1, 0C6h + pshuflw xmm1, xmm1, 0C6h + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_3012 + + shuf99: + pop esi + pop ebx + ret + } +} + +// YUY2 - Macro-pixel = 2 image pixels +// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 
+ +// UYVY - Macro-pixel = 2 image pixels +// U0Y0V0Y1 + +__declspec(naked) __declspec(align(16)) +void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + align 4 + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 // YUYV + punpckhbw xmm1, xmm2 + movdqu [edi], xmm0 + movdqu [edi + 16], xmm1 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + align 4 + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + movdqa xmm1, xmm2 + lea eax, [eax + 16] + punpcklbw xmm1, xmm0 // UYVY + punpckhbw xmm2, xmm0 + movdqu [edi], xmm1 + movdqu [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +__declspec(naked) __declspec(align(16)) +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ + pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. + + // 2 pixel loop. 
+ align 4 + convertloop: +// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel +// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + movq xmm0, qword ptr [eax] // BGRABGRA + lea eax, [eax + 8] + punpcklbw xmm0, xmm3 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 // pixel 0 + punpckhwd xmm4, xmm3 // pixel 1 + cvtdq2ps xmm0, xmm0 // 4 floats + cvtdq2ps xmm4, xmm4 + movdqa xmm1, xmm0 // X + movdqa xmm5, xmm4 + mulps xmm0, [esi + 16] // C1 * X + mulps xmm4, [esi + 16] + addps xmm0, [esi] // result = C0 + C1 * X + addps xmm4, [esi] + movdqa xmm2, xmm1 + movdqa xmm6, xmm5 + mulps xmm2, xmm1 // X * X + mulps xmm6, xmm5 + mulps xmm1, xmm2 // X * X * X + mulps xmm5, xmm6 + mulps xmm2, [esi + 32] // C2 * X * X + mulps xmm6, [esi + 32] + mulps xmm1, [esi + 48] // C3 * X * X * X + mulps xmm5, [esi + 48] + addps xmm0, xmm2 // result += C2 * X * X + addps xmm4, xmm6 + addps xmm0, xmm1 // result += C3 * X * X * X + addps xmm4, xmm5 + cvttps2dq xmm0, xmm0 + cvttps2dq xmm4, xmm4 + packuswb xmm0, xmm4 + packuswb xmm0, xmm0 + sub ecx, 2 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +__declspec(naked) __declspec(align(16)) +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* poly */ + vbroadcastf128 ymm4, [ecx] // C0 + vbroadcastf128 ymm5, [ecx + 16] // C1 + vbroadcastf128 ymm6, [ecx + 32] // C2 + vbroadcastf128 ymm7, [ecx + 48] // C3 + mov ecx, [esp + 16] /* width */ + + // 2 pixel loop. + align 4 + convertloop: + vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels + lea eax, [eax + 8] + vcvtdq2ps ymm0, ymm0 // X 8 floats + vmulps ymm2, ymm0, ymm0 // X * X + vmulps ymm3, ymm0, ymm7 // C3 * X + vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X + vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X + vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X + vcvttps2dq ymm0, ymm0 + vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 + vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 + vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 + sub ecx, 2 + vmovq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +__declspec(naked) __declspec(align(16)) +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. + align 4 + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + movzx edx, byte ptr [eax - 4 + 3] + movzx edx, byte ptr [esi + edx * 4 + 3] + mov byte ptr [eax - 4 + 3], dl + dec ecx + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Tranform RGB pixels with color table. 
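// --- Editorial note (not part of the upstream patch) -------------------------
// Scalar sketch of the colour-table rows: each channel value indexes a table
// laid out as 256 B,G,R,A quads and is replaced by the entry for its own
// channel; the RGB variant below simply skips the alpha byte. Function name is
// hypothetical; types from <stdint.h>.
static void ARGBColorTableRowSketch(uint8_t* dst_argb,
                                    const uint8_t* table_argb, int width) {
  for (int x = 0; x < width; ++x) {
    for (int i = 0; i < 4; ++i) {  // i = 0..3 for B, G, R, A
      dst_argb[x * 4 + i] = table_argb[dst_argb[x * 4 + i] * 4 + i];
    }
  }
}
// ------------------------------------------------------------------------------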
+__declspec(naked) __declspec(align(16)) +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. + align 4 + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + dec ecx + jg convertloop + + pop esi + ret + } +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Tranform RGB pixels with luma table. +__declspec(naked) __declspec(align(16)) +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ + mov ecx, [esp + 8 + 12] /* width */ + movd xmm2, dword ptr [esp + 8 + 16] // luma table + movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff + pshufd xmm2, xmm2, 0 + pshufd xmm3, xmm3, 0 + pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 + psllw xmm4, 8 + pxor xmm5, xmm5 + + // 4 pixel loop. + align 4 + convertloop: + movdqu xmm0, qword ptr [eax] // generate luma ptr + pmaddubsw xmm0, xmm3 + phaddw xmm0, xmm0 + pand xmm0, xmm4 // mask out low bits + punpcklwd xmm0, xmm5 + paddd xmm0, xmm2 // add table base + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi], dl + movzx edx, byte ptr [eax + 1] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 1], dl + movzx edx, byte ptr [eax + 2] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 2], dl + movzx edx, byte ptr [eax + 3] // copy alpha. + mov byte ptr [edi + 3], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 4] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 4], dl + movzx edx, byte ptr [eax + 5] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 5], dl + movzx edx, byte ptr [eax + 6] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 6], dl + movzx edx, byte ptr [eax + 7] // copy alpha. + mov byte ptr [edi + 7], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 8] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 8], dl + movzx edx, byte ptr [eax + 9] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 9], dl + movzx edx, byte ptr [eax + 10] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 10], dl + movzx edx, byte ptr [eax + 11] // copy alpha. + mov byte ptr [edi + 11], dl + + movd esi, xmm0 + + movzx edx, byte ptr [eax + 12] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 12], dl + movzx edx, byte ptr [eax + 13] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 13], dl + movzx edx, byte ptr [eax + 14] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 14], dl + movzx edx, byte ptr [eax + 15] // copy alpha. 
+ mov byte ptr [edi + 15], dl + + sub ecx, 4 + lea eax, [eax + 16] + lea edi, [edi + 16] + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +#endif // defined(_M_X64) +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/row_x86.asm b/TMessagesProj/jni/libyuv/source/row_x86.asm new file mode 100644 index 000000000..0cb326f8e --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/row_x86.asm @@ -0,0 +1,146 @@ +; +; Copyright 2012 The LibYuv Project Authors. All rights reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%ifdef __YASM_VERSION_ID__ +%if __YASM_VERSION_ID__ < 01020000h +%error AVX2 is supported only by yasm 1.2.0 or later. +%endif +%endif +%include "x86inc.asm" + +SECTION .text + +; cglobal numeric constants are parameters, gpr regs, mm regs + +; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) + +%macro YUY2TOYROW 2-3 +cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix +%ifidn %1,YUY2 + pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff + psrlw m2, m2, 8 +%endif + + ALIGN 4 +.convertloop: + mov%2 m0, [src_yuy2q] + mov%2 m1, [src_yuy2q + mmsize] + lea src_yuy2q, [src_yuy2q + mmsize * 2] +%ifidn %1,YUY2 + pand m0, m0, m2 ; YUY2 even bytes are Y + pand m1, m1, m2 +%else + psrlw m0, m0, 8 ; UYVY odd bytes are Y + psrlw m1, m1, 8 +%endif + packuswb m0, m0, m1 +%if cpuflag(AVX2) + vpermq m0, m0, 0xd8 +%endif + sub pixd, mmsize + mov%2 [dst_yq], m0 + lea dst_yq, [dst_yq + mmsize] + jg .convertloop + REP_RET +%endmacro + +; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version. 
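+; Editorial note, not part of upstream libyuv: the INIT_*/YUY2TOYROW lines
+; below expand the macro above into YUY2ToYRow and UYVYToYRow variants for
+; MMX, SSE2 and AVX2 (plus _Unaligned forms). YUY2 keeps Y in the even bytes,
+; extracted by masking with 0x00ff; UYVY keeps Y in the odd bytes, extracted
+; with psrlw 8.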
+INIT_MMX MMX
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_XMM SSE2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_YMM AVX2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW UYVY,a,
+
+; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
+
+%macro SplitUVRow 1-2
+cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
+ pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff
+ psrlw m4, m4, 8
+ sub dst_vq, dst_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uvq]
+ mov%1 m1, [src_uvq + mmsize]
+ lea src_uvq, [src_uvq + mmsize * 2]
+ psrlw m2, m0, 8 ; odd bytes
+ psrlw m3, m1, 8
+ pand m0, m0, m4 ; even bytes
+ pand m1, m1, m4
+ packuswb m0, m0, m1
+ packuswb m2, m2, m3
+%if cpuflag(AVX2)
+ vpermq m0, m0, 0xd8
+ vpermq m2, m2, 0xd8
+%endif
+ mov%1 [dst_uq], m0
+ mov%1 [dst_uq + dst_vq], m2
+ lea dst_uq, [dst_uq + mmsize]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_XMM SSE2
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_YMM AVX2
+SplitUVRow a,
+
+; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+; int width);
+
+%macro MergeUVRow_ 1-2
+cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+ sub src_vq, src_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uq]
+ mov%1 m1, [src_vq]
+ lea src_uq, [src_uq + mmsize]
+ punpcklbw m2, m0, m1 // first 8 UV pairs
+ punpckhbw m0, m0, m1 // next 8 UV pairs
+%if cpuflag(AVX2)
+ vperm2i128 m1, m2, m0, 0x20 // low 128 of ymm2 and low 128 of ymm0
+ vperm2i128 m2, m2, m0, 0x31 // high 128 of ymm2 and high 128 of ymm0
+ mov%1 [dst_uvq], m1
+ mov%1 [dst_uvq + mmsize], m2
+%else
+ mov%1 [dst_uvq], m2
+ mov%1 [dst_uvq + mmsize], m0
+%endif
+ lea dst_uvq, [dst_uvq + mmsize * 2]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_XMM SSE2
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_YMM AVX2
+MergeUVRow_ a,
+
diff --git a/TMessagesProj/jni/libyuv/source/scale.cc b/TMessagesProj/jni/libyuv/source/scale.cc
new file mode 100644
index 000000000..5b33b5f04
--- /dev/null
+++ b/TMessagesProj/jni/libyuv/source/scale.cc
@@ -0,0 +1,1716 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyPlane
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Remove this macro if OVERREAD is safe.
+#define AVOID_OVERREAD 1
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+
+// Scale plane, 1/2
+// This is an optimized version for scaling down a plane to 1/2 of
+// its original size.
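+// Editorial note, not part of upstream libyuv: a scalar model of the three
+// 1/2 filters, for a 2x2 source block {a, b / c, d} (rounding as in the C
+// row functions):
+//   kFilterNone   -> d                       (odd row, odd column sample)
+//   kFilterLinear -> (a + b + 1) / 2         (horizontal average only)
+//   kFilterBox    -> (a + b + c + d + 2) / 4 (full 2x2 average)
+// The SIMD row functions selected below produce the same results 16 or more
+// pixels at a time.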
+ +static void ScalePlaneDown2(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = + filtering == kFilterNone ? ScaleRowDown2_C : + (filtering == kFilterLinear ? ScaleRowDown2Linear_C : + ScaleRowDown2Box_C); + int row_stride = src_stride << 1; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON; + } +#elif defined(HAS_SCALEROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 : + ScaleRowDown2Box_Unaligned_SSE2); + if (IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : + ScaleRowDown2Box_SSE2); + } + } +#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && + IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown2 = filtering ? + ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown2_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) = + filtering == kFilterNone ? ScaleRowDown2_16_C : + (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C : + ScaleRowDown2Box_16_C); + int row_stride = src_stride << 1; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON : + ScaleRowDown2_16_NEON; + } +#elif defined(HAS_SCALEROWDOWN2_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? + ScaleRowDown2_Unaligned_16_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 : + ScaleRowDown2Box_Unaligned_16_SSE2); + if (IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 : + (filtering == kFilterLinear ? 
ScaleRowDown2Linear_16_SSE2 : + ScaleRowDown2Box_16_SSE2); + } + } +#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && + IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown2 = filtering ? + ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane, 1/4 +// This is an optimized version for scaling down a plane to 1/4 of +// its original size. + +static void ScalePlaneDown4(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; + int row_stride = src_stride << 2; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; + } +#elif defined(HAS_SCALEROWDOWN4_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; + } +#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown4_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; + int row_stride = src_stride << 2; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON : + ScaleRowDown4_16_NEON; + } +#elif defined(HAS_SCALEROWDOWN4_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 : + ScaleRowDown4_16_SSE2; + } +#elif defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown4 = filtering ? 
+ ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane down, 3/4 + +static void ScalePlaneDown34(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; + } +#if defined(HAS_SCALEROWDOWN34_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, + dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown34_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_C; + ScaleRowDown34_1 = ScaleRowDown34_16_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN34_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_16_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, + dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + + +// Scale plane, 3/8 +// This is an optimized version for scaling down a plane to 3/8 +// of its original size. +// +// Uses box filter arranges like this +// aaabbbcc -> abc +// aaabbbcc def +// aaabbbcc ghi +// dddeeeff +// dddeeeff +// dddeeeff +// ggghhhii +// ggghhhii +// Boxes are 3x3, 2x3, 3x2 and 2x2 + +static void ScalePlaneDown38(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; + } +#if defined(HAS_SCALEROWDOWN38_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; + } + } +#elif defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; + } + } +#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown38_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_C; + ScaleRowDown38_2 = ScaleRowDown38_16_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN38_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_16_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON; + } + } +#elif defined(HAS_SCALEROWDOWN38_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; + } + } +#elif defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +static __inline uint32 SumBox(int iboxwidth, int iboxheight, + ptrdiff_t src_stride, const uint8* src_ptr) { + uint32 sum = 0u; + int y; + assert(iboxwidth > 0); + assert(iboxheight > 0); + for (y = 0; y < iboxheight; ++y) { + int x; + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + src_ptr += src_stride; + } + return sum; +} + +static __inline uint32 SumBox_16(int iboxwidth, int iboxheight, + ptrdiff_t src_stride, const uint16* src_ptr) { + uint32 sum = 0u; + int y; + assert(iboxwidth > 0); + assert(iboxheight > 0); + for (y = 0; y < iboxheight; ++y) { + int x; + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + src_ptr += src_stride; + } + return sum; +} + +static void ScalePlaneBoxRow_C(int dst_width, int boxheight, + int x, int dx, ptrdiff_t src_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int i; + int boxwidth; + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / + (boxwidth * boxheight); + } +} + +static void ScalePlaneBoxRow_16_C(int dst_width, int boxheight, + int x, int dx, ptrdiff_t src_stride, + const uint16* src_ptr, uint16* dst_ptr) { + int i; + int boxwidth; + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + 
boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumBox_16(boxwidth, boxheight, src_stride, src_ptr + ix) / + (boxwidth * boxheight); + } +} + +static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { + uint32 sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) { + uint32 sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = (dx >> 16); + int* scaleptr = scaletbl - minboxwidth; + int boxwidth; + scaletbl[0] = 65536 / (minboxwidth * boxheight); + scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + } +} + +static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, + const uint32* src_ptr, uint16* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = (dx >> 16); + int* scaleptr = scaletbl - minboxwidth; + int boxwidth; + scaletbl[0] = 65536 / (minboxwidth * boxheight); + scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * + scaleptr[boxwidth] >> 16; + } +} + +static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int boxwidth = (dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, + const uint32* src_ptr, uint16* dst_ptr) { + int boxwidth = (dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +// Scale plane down to any dimensions, with interpolation. +// (boxfilter). +// +// Same method as SimpleScale, which is fixed point, outputting +// one pixel of destination using fixed point (16.16) to step +// through source, sampling a box of pixel with simple +// averaging. +static void ScalePlaneBox(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + // TODO(fbarchard): Remove this and make AddRows handle boxheight 1. 
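+  // Editorial note, not part of upstream libyuv: two paths follow. The
+  // generic path below calls SumBox once per output pixel. The faster path
+  // further down first accumulates boxheight source rows into a uint16 row
+  // buffer with ScaleAddRows, then averages runs of columns with
+  // ScaleAddCols, so each source row is only accumulated once.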
+ if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) { + uint8* dst = dst_ptr; + int j; + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint8* src = src_ptr + iy * src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = (y >> 16) - iy; + ScalePlaneBoxRow_C(dst_width, boxheight, + x, dx, src_stride, + src, dst); + dst += dst_stride; + } + return; + } + { + // Allocate a row buffer of uint16. + align_buffer_64(row16, src_width * 2); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint16* src_ptr, uint8* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C; + void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C; + +#if defined(HAS_SCALEADDROWS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && +#ifdef AVOID_OVERREAD + IS_ALIGNED(src_width, 16) && +#endif + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + ScaleAddRows = ScaleAddRows_SSE2; + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint8* src = src_ptr + iy * src_stride; + y += dy; + if (y > (src_height << 16)) { + y = (src_height << 16); + } + boxheight = (y >> 16) - iy; + ScaleAddRows(src, src_stride, (uint16*)(row16), + src_width, boxheight); + ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), + dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row16); + } +} + +static void ScalePlaneBox_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + // TODO(fbarchard): Remove this and make AddRows handle boxheight 1. + if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) { + uint16* dst = dst_ptr; + int j; + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint16* src = src_ptr + iy * src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = (y >> 16) - iy; + ScalePlaneBoxRow_16_C(dst_width, boxheight, + x, dx, src_stride, + src, dst); + dst += dst_stride; + } + return; + } + { + // Allocate a row buffer of uint32. + align_buffer_64(row32, src_width * 4); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint32* src_ptr, uint16* dst_ptr) = + (dx & 0xffff) ? 
ScaleAddCols2_16_C: ScaleAddCols1_16_C; + void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride, + uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C; + +#if defined(HAS_SCALEADDROWS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && +#ifdef AVOID_OVERREAD + IS_ALIGNED(src_width, 16) && +#endif + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + ScaleAddRows = ScaleAddRows_16_SSE2; + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint16* src = src_ptr + iy * src_stride; + y += dy; + if (y > (src_height << 16)) { + y = (src_height << 16); + } + boxheight = (y >> 16) - iy; + ScaleAddRows(src, src_stride, (uint32*)(row32), + src_width, boxheight); + ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), + dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row32); + } +} + +// Scale plane down with bilinear interpolation. +void ScalePlaneBilinearDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(src_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + + +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8* src = src_ptr + yi * 
src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +void ScalePlaneBilinearDown_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width * 2); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C; + void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_16_SSE2; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_16_SSE2; + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_16_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_16_SSSE3; + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) { + InterpolateRow = InterpolateRow_Any_16_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_16_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) { + InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; + if (IS_ALIGNED(src_width, 4)) { + InterpolateRow = InterpolateRow_16_MIPS_DSPR2; + } + } +#endif + + +#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_16_SSSE3; + } +#endif + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint16* src = src_ptr + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow((uint16*)row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +// Scale up down with bilinear interpolation. 
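+// Editorial note, not part of upstream libyuv: the up-scaler below keeps two
+// horizontally pre-scaled rows in a ping-pong buffer (rowptr/rowstride, with
+// rowstride negated on reuse) and produces each output row by blending the
+// two buffered rows with InterpolateRow using the fractional part of y, so
+// at most one new source row is scaled horizontally per output row.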
+void ScalePlaneBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_C : ScaleCols_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_C; + } +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleFilterCols = ScaleColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint8* src = src_ptr + yi * src_stride; + + // Allocate 2 row buffers. 
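+    // Editorial note, not part of upstream libyuv: kRowSize rounds the row
+    // width up to a multiple of 16 so that both buffered rows stay 16-byte
+    // aligned within the 64-byte aligned allocation and the SIMD row
+    // functions can write whole vectors.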
+ const int kRowSize = (dst_width + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + uint8* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +void ScalePlaneBilinearUp_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_16_C; + void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_16_SSE2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_16_SSE2; + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_16_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_16_SSSE3; + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) { + InterpolateRow = InterpolateRow_Any_16_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_16_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_16_MIPS_DSPR2; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_16_C; + } +#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_16_SSSE3; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_16_C; 
+#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleFilterCols = ScaleColsUp2_16_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint16* src = src_ptr + yi * src_stride; + + // Allocate 2 row buffers. + const int kRowSize = (dst_width + 15) & ~15; + align_buffer_64(row, kRowSize * 4); + + uint16* rowptr = (uint16*)row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +// Scale Plane to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScalePlaneSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int i; + void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = ScaleCols_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleCols = ScaleColsUp2_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, + dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + } +} + +static void ScalePlaneSimple_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr) { + int i; + void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = ScaleCols_16_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. 
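+  // Editorial note, not part of upstream libyuv: in 16.16 fixed point the
+  // upper 16 bits of x select the source column and the lower 16 bits hold
+  // the fractional position. ScaleSlope() sets dx to roughly
+  // (src_width << 16) / dst_width, so output pixel i samples source column
+  // (x + i * dx) >> 16; point sampling simply ignores the fraction.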
+ int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_16_C; +#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleCols = ScaleColsUp2_16_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, + dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + } +} + +// Scale a plane. +// This function dispatches to a specialized scaler based on scale factor. + +LIBYUV_API +void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width) { + int dy = FixedDiv(src_height, dst_height); + // Arbitrary scale vertically, but unscaled vertically. + ScalePlaneVertical(src_height, + dst_width, dst_height, + src_stride, dst_stride, src, dst, + 0, 0, dy, 1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. + if (4 * dst_width == 3 * src_width && + 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + // 3/8 rounded up for odd sized chroma height. 
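+  // Editorial note, not part of upstream libyuv: the rounded-up height test
+  // below appears to exist so that the half-size chroma planes of an
+  // odd-sized I420 image, whose dimensions SUBSAMPLE rounds up, can still
+  // take the optimized 3/8 path.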
+ if (8 * dst_width == 3 * src_width && + dst_height == ((src_height * 3 + 7) / 8)) { + // optimized, 3/8 + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + filtering != kFilterBilinear) { + // optimized, 1/4 + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); +} + +LIBYUV_API +void ScalePlane_16(const uint16* src, int src_stride, + int src_width, int src_height, + uint16* dst, int dst_stride, + int dst_width, int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width) { + int dy = FixedDiv(src_height, dst_height); + // Arbitrary scale vertically, but unscaled vertically. + ScalePlaneVertical_16(src_height, + dst_width, dst_height, + src_stride, dst_stride, src, dst, + 0, 0, dy, 1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. + if (4 * dst_width == 3 * src_width && + 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + // 3/8 rounded up for odd sized chroma height. 
+ if (8 * dst_width == 3 * src_width && + dst_height == ((src_height * 3 + 7) / 8)) { + // optimized, 3/8 + ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + filtering != kFilterBilinear) { + // optimized, 1/4 + ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); +} + +// Scale an I420 image. +// This function in turn calls a scaling function for each plane. + +LIBYUV_API +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering); + return 0; +} + +LIBYUV_API +int I420Scale_16(const uint16* src_y, int src_stride_y, + const uint16* src_u, int src_stride_u, + const uint16* src_v, int src_stride_v, + int src_width, int src_height, + uint16* dst_y, int dst_stride_y, + uint16* dst_u, int dst_stride_u, + uint16* dst_v, int dst_stride_v, + int dst_width, int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_16(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering); + return 0; +} + +// Deprecated api +LIBYUV_API +int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int 
src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + LIBYUV_BOOL interpolate) { + return I420Scale(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + src_width, src_height, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + dst_width, dst_height, + interpolate ? kFilterBox : kFilterNone); +} + +// Deprecated api +LIBYUV_API +int ScaleOffset(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_yoffset, + LIBYUV_BOOL interpolate) { + // Chroma requires offset to multiple of 2. + int dst_yoffset_even = dst_yoffset & ~1; + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + int aheight = dst_height - dst_yoffset_even * 2; // actual output height + const uint8* src_y = src; + const uint8* src_u = src + src_width * src_height; + const uint8* src_v = src + src_width * src_height + + src_halfwidth * src_halfheight; + uint8* dst_y = dst + dst_yoffset_even * dst_width; + uint8* dst_u = dst + dst_width * dst_height + + (dst_yoffset_even >> 1) * dst_halfwidth; + uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + + (dst_yoffset_even >> 1) * dst_halfwidth; + if (!src || src_width <= 0 || src_height <= 0 || + !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 || + dst_yoffset_even >= dst_height) { + return -1; + } + return I420Scale(src_y, src_width, + src_u, src_halfwidth, + src_v, src_halfwidth, + src_width, src_height, + dst_y, dst_width, + dst_u, dst_halfwidth, + dst_v, dst_halfwidth, + dst_width, aheight, + interpolate ? kFilterBox : kFilterNone); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/scale_argb.cc b/TMessagesProj/jni/libyuv/source/scale_argb.cc new file mode 100644 index 000000000..e339cd7c7 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/scale_argb.cc @@ -0,0 +1,809 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include +#include + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyARGB +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// ScaleARGB ARGB, 1/2 +// This is an optimized version for scaling down a ARGB to 1/2 of +// its original size. +static void ScaleARGBDown2(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + int row_stride = src_stride * (dy >> 16); + void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) = + filtering == kFilterNone ? ScaleARGBRowDown2_C : + (filtering == kFilterLinear ? 
ScaleARGBRowDown2Linear_C : + ScaleARGBRowDown2Box_C); + assert(dx == 65536 * 2); // Test scale factor of 2. + assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. + // Advance to odd row, even column. + if (filtering == kFilterBilinear) { + src_argb += (y >> 16) * src_stride + (x >> 16) * 4; + } else { + src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4; + } + +#if defined(HAS_SCALEARGBROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 : + ScaleARGBRowDown2Box_SSE2); + } +#elif defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) { + ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON : + ScaleARGBRowDown2_NEON; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } +} + +// ScaleARGB ARGB, 1/4 +// This is an optimized version for scaling down a ARGB to 1/4 of +// its original size. +static void ScaleARGBDown4Box(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy) { + int j; + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 2 * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + int row_stride = src_stride * (dy >> 16); + void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C; + // Advance to odd row, even column. + src_argb += (y >> 16) * src_stride + (x >> 16) * 4; + assert(dx == 65536 * 4); // Test scale factor of 4. + assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. +#if defined(HAS_SCALEARGBROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; + } +#elif defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; + } +#endif + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, + row + kRowSize, dst_width * 2); + ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } + free_aligned_buffer_64(row); +} + +// ScaleARGB ARGB Even +// This is an optimized version for scaling down a ARGB to even +// multiple of its original size. +static void ScaleARGBDownEven(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + int col_step = dx >> 16; + int row_stride = (dy >> 16) * src_stride; + void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride, + int src_step, uint8* dst_argb, int dst_width) = + filtering ? 
ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + src_argb += (y >> 16) * src_stride + (x >> 16) * 4; +#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : + ScaleARGBRowDownEven_SSE2; + } +#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_argb, 4)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : + ScaleARGBRowDownEven_NEON; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } +} + +// Scale ARGB down with bilinear interpolation. +static void ScaleARGBBilinearDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; + int64 xlast = x + (int64)(dst_width - 1) * dx; + int64 xl = (dx >= 0) ? x : xlast; + int64 xr = (dx >= 0) ? xlast : x; + int clip_src_width; + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. + if (xr > src_width) { + xr = src_width; + } + clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. 
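+  // Worked example of the clip math above, with illustrative values that are
+  // not taken from any caller: x = 0x68000 (6.5), dx = 0x20000 (2.0) and
+  // dst_width = 8 give xlast = 0x148000 (20.5). Then xl = 6 & ~3 = 4,
+  // xr = 20 + 1 = 21 (bilinear also reads pixel 21), rounded up to 24, and,
+  // assuming src_width is at least 24 so no clamp applies,
+  // clip_src_width = (24 - 4) * 4 = 80 bytes of ARGB are read per row. The
+  // two statements below then rebase the source pointer and x onto pixel 4,
+  // leaving x = 0x28000 (2.5) relative to the clipped origin.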
+ src_argb += xl * 4; + x -= (int)(xl << 16); +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(clip_src_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row of ARGB. + { + align_buffer_64(row, clip_src_width * 4); + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8* src = src_argb + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); + } + dst_argb += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); + } +} + +// Scale ARGB up with bilinear interpolation. +static void ScaleARGBBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + filtering ? 
ScaleARGBFilterCols_C : ScaleARGBCols_C; + const int max_y = (src_height - 1) << 16; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + if (src_width >= 32768) { + ScaleARGBFilterCols = filtering ? + ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + + { + int yi = y >> 16; + const uint8* src = src_argb + yi * src_stride; + + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + uint8* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_argb + yi * src_stride; + } + if (yi != lasty) { + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +#ifdef YUVSCALEUP +// Scale YUV to ARGB up with bilinear interpolation. 
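+// Compiled only when YUVSCALEUP is defined. Sketch of the per-destination-row
+// flow implemented below (simplified; the real loop also tracks row parity for
+// the half-height U/V planes and reuses the previously converted row when the
+// source row index does not change):
+//   I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+//   ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+//   InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);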
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; + if (src_width >= 32768) { + ScaleARGBFilterCols = filtering ? 
+ ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. + int yi = y >> 16; + int uv_yi = yi >> kYShift; + const uint8* src_row_y = src_y + yi * src_stride_y; + const uint8* src_row_u = src_u + uv_yi * src_stride_u; + const uint8* src_row_v = src_v + uv_yi * src_stride_v; + + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + // Allocate 1 row of ARGB for source conversion. + align_buffer_64(argb_row, src_width * 4); + + uint8* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + // TODO(fbarchard): Convert first 2 rows of YUV to ARGB. + ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx); + if (src_height > 1) { + src_row_y += src_stride_y; + if (yi & 1) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx); + if (src_height > 2) { + src_row_y += src_stride_y; + if (!(yi & 1)) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + uv_yi = yi >> kYShift; + src_row_y = src_y + yi * src_stride_y; + src_row_u = src_u + uv_yi * src_stride_u; + src_row_v = src_v + uv_yi * src_stride_v; + } + if (yi != lasty) { + // TODO(fbarchard): Convert the clipped region of row. + I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width); + ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src_row_y += src_stride_y; + if (yi & 1) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride_argb; + y += dy; + } + free_aligned_buffer_64(row); + free_aligned_buffer_64(row_argb); +} +#endif + +// Scale ARGB to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScaleARGBSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy) { + int j; + void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + (src_width >= 32768) ? 
ScaleARGBCols64_C : ScaleARGBCols_C; +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBCols = ScaleARGBCols_SSE2; + } +#endif + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + for (j = 0; j < dst_height; ++j) { + ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, + dst_width, x, dx); + dst_argb += dst_stride; + y += dy; + } +} + +// ScaleARGB a ARGB. +// This function in turn calls a scaling function +// suitable for handling the desired resolutions. +static void ScaleARGB(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // ARGB does not support box filter yet, but allow the user to pass it. + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, + filtering); + + // Negative src_height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + if (clip_x) { + int64 clipf = (int64)(clip_x) * dx; + x += (clipf & 0xffff); + src += (clipf >> 16) * 4; + dst += clip_x * 4; + } + if (clip_y) { + int64 clipf = (int64)(clip_y) * dy; + y += (clipf & 0xffff); + src += (clipf >> 16) * src_stride; + dst += clip_y * dst_stride; + } + + // Special case for integer step values. + if (((dx | dy) & 0xffff) == 0) { + if (!dx || !dy) { // 1 pixel wide and/or tall. + filtering = kFilterNone; + } else { + // Optimized even scale down. ie 2, 4, 6, 8, 10x. + if (!(dx & 0x10000) && !(dy & 0x10000)) { + if (dx == 0x20000) { + // Optimized 1/2 downsample. + ScaleARGBDown2(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + if (dx == 0x40000 && filtering == kFilterBox) { + // Optimized 1/4 box downsample. + ScaleARGBDown4Box(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy); + return; + } + ScaleARGBDownEven(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + // Optimized odd scale down. ie 3, 5, 7, 9x. + if ((dx & 0x10000) && (dy & 0x10000)) { + filtering = kFilterNone; + if (dx == 0x10000 && dy == 0x10000) { + // Straight copy. + ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride, + dst, dst_stride, clip_width, clip_height); + return; + } + } + } + } + if (dx == 0x10000 && (x & 0xffff) == 0) { + // Arbitrary scale vertically, but unscaled vertically. 
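+    // dx == 0x10000 is exactly 1.0 in 16.16 fixed point and a zero fractional
+    // x means the horizontal phase starts on a whole pixel, so each destination
+    // column maps 1:1 to a source column (for example, with x = 0, column 3
+    // reads (x + 3 * 0x10000) >> 16 = source column 3). Only the vertical axis
+    // needs filtering, which ScalePlaneVertical does with bpp = 4 (ARGB).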
+ ScalePlaneVertical(src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, y, dy, 4, filtering); + return; + } + if (filtering && dy < 65536) { + ScaleARGBBilinearUp(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + if (filtering) { + ScaleARGBBilinearDown(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + ScaleARGBSimple(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy); +} + +LIBYUV_API +int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering) { + if (!src_argb || src_width == 0 || src_height == 0 || + !dst_argb || dst_width <= 0 || dst_height <= 0 || + clip_x < 0 || clip_y < 0 || + (clip_x + clip_width) > dst_width || + (clip_y + clip_height) > dst_height) { + return -1; + } + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, + dst_argb, dst_stride_argb, dst_width, dst_height, + clip_x, clip_y, clip_width, clip_height, filtering); + return 0; +} + +// Scale an ARGB image. +LIBYUV_API +int ARGBScale(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + enum FilterMode filtering) { + if (!src_argb || src_width == 0 || src_height == 0 || + !dst_argb || dst_width <= 0 || dst_height <= 0) { + return -1; + } + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, + dst_argb, dst_stride_argb, dst_width, dst_height, + 0, 0, dst_width, dst_height, filtering); + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/scale_common.cc b/TMessagesProj/jni/libyuv/source/scale_common.cc new file mode 100644 index 000000000..e4b2acc41 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/scale_common.cc @@ -0,0 +1,1165 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include +#include + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyARGB +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static __inline int Abs(int v) { + return v >= 0 ? 
v : -v; +} + +// CPU agnostic row functions +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + const uint16* s = src_ptr; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + const uint16* s = src_ptr; + const uint16* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + + src_ptr[stride * 
2 + 6] + src_ptr[stride * 2 + 7] + + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + + 8) >> 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + } +} + +void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + + 8) >> 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + } +} + +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +// Filter rows 0 and 1 together, 3 : 1 +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width) { + const uint16* s = src_ptr; + const uint16* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && 
(dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +// Filter rows 1 and 2 together, 1 : 1 +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width) { + const uint16* s = src_ptr; + const uint16* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. 
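+// The x and dx arguments are accepted for signature compatibility with the
+// other column scalers but are ignored: each source pixel is simply written to
+// two adjacent destination pixels. Illustrative example (values made up):
+// src = {10, 20, 30}, dst_width = 6  ->  dst = {10, 10, 20, 20, 30, 30}.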
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +// (1-f)a + fb can be replaced with a + f(b-a) +#define BLENDER(a, b, f) (uint8)((int)(a) + \ + ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +#define BLENDER(a, b, f) (uint16)((int)(a) + \ + ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + +void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) 
{ + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int x; + assert(src_width > 0); + assert(src_height > 0); + for (x = 0; x < src_width; ++x) { + const uint8* s = src_ptr + x; + 
unsigned int sum = 0u; + int y; + for (y = 0; y < src_height; ++y) { + sum += s[0]; + s += src_stride; + } + // TODO(fbarchard): Consider limitting height to 256 to avoid overflow. + dst_ptr[x] = sum < 65535u ? sum : 65535u; + } +} + +void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint32* dst_ptr, int src_width, int src_height) { + int x; + assert(src_width > 0); + assert(src_height > 0); + for (x = 0; x < src_width; ++x) { + const uint16* s = src_ptr + x; + unsigned int sum = 0u; + int y; + for (y = 0; y < src_height; ++y) { + sum += s[0]; + s += src_stride; + } + // No risk of overflow here now + dst_ptr[x] = sum; + } +} + +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[1]; + dst[1] = src[3]; + src += 4; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[1]; + } +} + +void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; + dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; + dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1; + dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += src_stepx * 4; + dst_argb += 4; + } +} + +// Scales a single row of pixels using point sampling. 
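+// x and dx are 16.16 fixed point: destination pixel j copies the 32-bit ARGB
+// word at source index (x + j * dx) >> 16. Illustrative examples (values made
+// up): x = 0, dx = 0x8000 (0.5) repeats every source pixel twice for a 2x
+// upscale; x = 0, dx = 0x20000 (2.0) keeps every other pixel for a 1/2
+// downscale.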
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 +#define BLENDERC(a, b, f, s) (uint32)( \ + BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ + BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} + +void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64 xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int64 xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + +// Scale plane vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int y, int dy, + int bpp, enum FilterMode filtering) { + // TODO(fbarchard): Allow higher bpp. + int dst_width_bytes = dst_width * bpp; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; + int j; + assert(bpp >= 1 && bpp <= 4); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * bpp; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(dst_width_bytes, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow(dst_argb, src_argb + yi * src_stride, + src_stride, dst_width_bytes, yf); + dst_argb += dst_stride; + y += dy; + } +} +void ScalePlaneVertical_16(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_argb, uint16* dst_argb, + int x, int y, int dy, + int wpp, enum FilterMode filtering) { + // TODO(fbarchard): Allow higher wpp. + int dst_width_words = dst_width * wpp; + void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_16_C; + const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; + int j; + assert(wpp >= 1 && wpp <= 2); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * wpp; +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_16_SSE2; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_Unaligned_16_SSE2; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_16_SSSE3; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_Unaligned_16_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) { + InterpolateRow = InterpolateRow_Any_16_AVX2; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_16_NEON; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; + if (IS_ALIGNED(dst_width_bytes, 4)) { + InterpolateRow = InterpolateRow_16_MIPS_DSPR2; + } + } +#endif + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow(dst_argb, src_argb + yi * src_stride, + src_stride, dst_width_words, yf); + dst_argb += dst_stride; + y += dy; + } +} + +// Simplify the filtering based on scale factors. +enum FilterMode ScaleFilterReduce(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering) { + if (src_width < 0) { + src_width = -src_width; + } + if (src_height < 0) { + src_height = -src_height; + } + if (filtering == kFilterBox) { + // If scaling both axis to 0.5 or larger, switch from Box to Bilinear. + if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) { + filtering = kFilterBilinear; + } + // If scaling to larger, switch from Box to Bilinear. + if (dst_width >= src_width || dst_height >= src_height) { + filtering = kFilterBilinear; + } + } + if (filtering == kFilterBilinear) { + if (src_height == 1) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. + if (dst_height == src_height || dst_height * 3 == src_height) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to + // avoid reading 2 pixels horizontally that causes memory exception. + if (src_width == 1) { + filtering = kFilterNone; + } + } + if (filtering == kFilterLinear) { + if (src_width == 1) { + filtering = kFilterNone; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to None. 
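+    // In the check below, dst_width == src_width means the horizontal step is
+    // exactly 1.0 with zero phase, so a linear horizontal filter would
+    // reproduce plain point sampling; the exact 1/3-width case is likewise
+    // routed to the unfiltered path (see the TODO above for the general rule).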
+ if (dst_width == src_width || dst_width * 3 == src_width) { + filtering = kFilterNone; + } + } + return filtering; +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div) { + return (int)(((int64)(num) << 16) / div); +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv1_C(int num, int div) { + return (int)((((int64)(num) << 16) - 0x00010001) / + (div - 1)); +} + +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) + +// Compute slope values for stepping. +void ScaleSlope(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering, + int* x, int* y, int* dx, int* dy) { + assert(x != NULL); + assert(y != NULL); + assert(dx != NULL); + assert(dy != NULL); + assert(src_width != 0); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + // Check for 1 pixel and avoid FixedDiv overflow. + if (dst_width == 1 && src_width >= 32768) { + dst_width = src_width; + } + if (dst_height == 1 && src_height >= 32768) { + dst_height = src_height; + } + if (filtering == kFilterBox) { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = 0; + *y = 0; + } else if (filtering == kFilterBilinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + if (dst_height <= src_height) { + *dy = FixedDiv(src_height, dst_height); + *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_height > 1) { + *dy = FixedDiv1(src_height, dst_height); + *y = 0; + } + } else if (filtering == kFilterLinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + *dy = FixedDiv(src_height, dst_height); + *y = *dy >> 1; + } else { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = CENTERSTART(*dx, 0); + *y = CENTERSTART(*dy, 0); + } + // Negative src_width means horizontally mirror. + if (src_width < 0) { + *x += (dst_width - 1) * *dx; + *dx = -*dx; + // src_width = -src_width; // Caller must do this. + } +} +#undef CENTERSTART + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/scale_mips.cc b/TMessagesProj/jni/libyuv/source/scale_mips.cc new file mode 100644 index 000000000..3eb4f27c4 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/scale_mips.cc @@ -0,0 +1,654 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC MIPS DSPR2 +#if !defined(LIBYUV_DISABLE_MIPS) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) + +void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 4 \n" // iterations -> by 16 + "beqz $t9, 2f \n" + " nop \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| + // TODO(fbarchard): Use odd pixels instead of even. + "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0| + "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8| + "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16| + "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu $t9, $t9, -1 \n" + "sw $t8, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $t1, 8(%[dst]) \n" + "sw $t2, 12(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 16 \n" + + "2: \n" + "andi $t9, %[dst_width], 0xf \n" // residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lbu $t0, 0(%[src_ptr]) \n" + "addiu %[src_ptr], %[src_ptr], 2 \n" + "addiu $t9, $t9, -1 \n" + "sb $t0, 0(%[dst]) \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* t = src_ptr + src_stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 3 \n" // iterations -> step 8 + "bltz $t9, 2f \n" + " nop \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 0(%[t]) \n" // |19|18|17|16| + "lw $t5, 4(%[t]) \n" // |23|22|21|20| + "lw $t6, 8(%[t]) \n" // |27|26|25|24| + "lw $t7, 12(%[t]) \n" // |31|30|29|28| + "addiu $t9, $t9, -1 \n" + "srl $t8, $t0, 16 \n" // |X|X|3|2| + "ins $t0, $t4, 16, 16 \n" // |17|16|1|0| + "ins $t4, $t8, 0, 16 \n" // |19|18|3|2| + "raddu.w.qb $t0, $t0 \n" // |17+16+1+0| + "raddu.w.qb $t4, $t4 \n" // |19+18+3+2| + "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2 + "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2 + "srl $t8, $t1, 16 \n" // |X|X|7|6| + "ins $t1, $t5, 16, 16 \n" // |21|20|5|4| + "ins $t5, $t8, 0, 16 \n" // |22|23|7|6| + "raddu.w.qb $t1, $t1 \n" // |21+20+5+4| + "raddu.w.qb $t5, $t5 \n" // |23+22+7+6| + "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2 + "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2 + "srl $t8, $t2, 16 \n" // |X|X|11|10| + "ins $t2, $t6, 16, 16 \n" // |25|24|9|8| + "ins $t6, $t8, 0, 16 \n" // |27|26|11|10| + "raddu.w.qb $t2, $t2 \n" // |25+24+9+8| + "raddu.w.qb $t6, $t6 \n" // |27+26+11+10| + "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2 + "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2 + "srl $t8, $t3, 16 \n" // |X|X|15|14| + "ins $t3, $t7, 16, 16 \n" // |29|28|13|12| + "ins $t7, $t8, 0, 16 \n" 
// |31|30|15|14| + "raddu.w.qb $t3, $t3 \n" // |29+28+13+12| + "raddu.w.qb $t7, $t7 \n" // |31+30+15+14| + "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2 + "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2 + "addiu %[src_ptr], %[src_ptr], 16 \n" + "addiu %[t], %[t], 16 \n" + "sb $t0, 0(%[dst]) \n" + "sb $t4, 1(%[dst]) \n" + "sb $t1, 2(%[dst]) \n" + "sb $t5, 3(%[dst]) \n" + "sb $t2, 4(%[dst]) \n" + "sb $t6, 5(%[dst]) \n" + "sb $t3, 6(%[dst]) \n" + "sb $t7, 7(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 8 \n" + + "2: \n" + "andi $t9, %[dst_width], 0x7 \n" // x = residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lwr $t1, 0(%[src_ptr]) \n" + "lwl $t1, 3(%[src_ptr]) \n" + "lwr $t2, 0(%[t]) \n" + "lwl $t2, 3(%[t]) \n" + "srl $t8, $t1, 16 \n" + "ins $t1, $t2, 16, 16 \n" + "ins $t2, $t8, 0, 16 \n" + "raddu.w.qb $t1, $t1 \n" + "raddu.w.qb $t2, $t2 \n" + "shra_r.w $t1, $t1, 2 \n" + "shra_r.w $t2, $t2, 2 \n" + "sb $t1, 0(%[dst]) \n" + "sb $t2, 1(%[dst]) \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "addiu $t9, $t9, -2 \n" + "addiu %[t], %[t], 4 \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 2 \n" + + "3: \n" + ".set pop \n" + + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), [t] "+r" (t) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 3 \n" + "beqz $t9, 2f \n" + " nop \n" + + ".p2align 2 \n" + "1: \n" + "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| + "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0| + "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8| + "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16| + "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24| + "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0| + "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu $t9, $t9, -1 \n" + "sw $t1, 0(%[dst]) \n" + "sw $t5, 4(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 8 \n" + + "2: \n" + "andi $t9, %[dst_width], 7 \n" // residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lbu $t1, 0(%[src_ptr]) \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "addiu $t9, $t9, -1 \n" + "sb $t1, 0(%[dst]) \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst) + : [dst_width] "r" (dst_width) + : "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + intptr_t stride = src_stride; + const uint8* s1 = src_ptr + stride; + const uint8* s2 = s1 + stride; + const uint8* s3 = s2 + stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 1 \n" + "andi $t8, %[dst_width], 1 \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| + "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 4(%[s1]) \n" // |23|22|21|20| + "lw $t6, 4(%[s2]) \n" // |27|26|25|24| + "lw $t7, 
4(%[s3]) \n" // |31|30|29|28| + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| + "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16| + "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20| + "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24| + "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28| + "add $t0, $t0, $t1 \n" + "add $t1, $t2, $t3 \n" + "add $t0, $t0, $t1 \n" + "add $t4, $t4, $t5 \n" + "add $t6, $t6, $t7 \n" + "add $t4, $t4, $t6 \n" + "shra_r.w $t0, $t0, 4 \n" + "shra_r.w $t4, $t4, 4 \n" + "sb $t0, 0(%[dst]) \n" + "sb $t4, 1(%[dst]) \n" + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[s1], %[s1], 8 \n" + "addiu %[s2], %[s2], 8 \n" + "addiu %[s3], %[s3], 8 \n" + "addiu $t9, $t9, -1 \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 2 \n" + "beqz $t8, 2f \n" + " nop \n" + + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| + "add $t0, $t0, $t1 \n" + "add $t1, $t2, $t3 \n" + "add $t0, $t0, $t1 \n" + "shra_r.w $t0, $t0, 4 \n" + "sb $t0, 0(%[dst]) \n" + + "2: \n" + ".set pop \n" + + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [s1] "+r" (s1), + [s2] "+r" (s2), + [s3] "+r" (s3) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6","t7", "t8", "t9" + ); +} + +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + ".p2align 2 \n" + "1: \n" + "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| + "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13| + "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30| + "addiu %[dst_width], %[dst_width], -24 \n" + "ins $t1, $t1, 8, 16 \n" // |3|1|0|X| + "ins $t4, $t0, 8, 16 \n" // |X|15|13|12| + "ins $t5, $t5, 8, 16 \n" // |19|17|16|X| + "ins $t8, $t9, 8, 16 \n" // |X|31|29|28| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5| + "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21| + "prepend $t1, $t2, 8 \n" // |4|3|1|0| + "prepend $t3, $t4, 24 \n" // |15|13|12|11| + "prepend $t5, $t6, 8 \n" // |20|19|17|16| + "prepend $t7, $t8, 24 \n" // |31|29|28|27| + "sw $t1, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $t3, 8(%[dst]) \n" + "sw $t5, 12(%[dst]) \n" + "sw $t9, 16(%[dst]) \n" + "sw $t7, 20(%[dst]) \n" + "bnez %[dst_width], 1b \n" + " addiu %[dst], %[dst], 24 \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6","t7", "t8", "t9" + ); +} + +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "repl.ph $t3, 3 \n" // 0x00030003 + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| + 
"rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1| + "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| + "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3| + "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3| + "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1| + "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t1, $t1 \n" + "shra_r.w $t0, $t0, 1 \n" + "shra_r.w $t1, $t1, 1 \n" + "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1| + "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| + "rotr $t2, $t2, 16 \n" // |0|S1|0|S2| + "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| + "addu.ph $t2, $t2, $t4 \n" + "addu.ph $t6, $t6, $t5 \n" + "sll $t5, $t0, 1 \n" + "add $t0, $t5, $t0 \n" + "shra_r.ph $t2, $t2, 2 \n" + "shra_r.ph $t6, $t6, 2 \n" + "shll.ph $t4, $t2, 1 \n" + "addq.ph $t4, $t4, $t2 \n" + "addu $t0, $t0, $t1 \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "shra_r.w $t0, $t0, 2 \n" + "addu.ph $t6, $t6, $t4 \n" + "shra_r.ph $t6, $t6, 2 \n" + "srl $t1, $t6, 16 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "sb $t1, 0(%[d]) \n" + "sb $t0, 1(%[d]) \n" + "sb $t6, 2(%[d]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[d], %[d], 3 \n" + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [src_stride] "+r" (src_stride), + [d] "+r" (d), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6" + ); +} + +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "repl.ph $t2, 3 \n" // 0x00030003 + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| + "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1| + "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| + "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3| + "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3| + "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1| + "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t1, $t1 \n" + "shra_r.w $t0, $t0, 1 \n" + "shra_r.w $t1, $t1, 1 \n" + "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1| + "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| + "rotr $t4, $t4, 16 \n" // |0|S1|0|S2| + "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| + "addu.ph $t4, $t4, $t3 \n" + "addu.ph $t6, $t6, $t5 \n" + "shra_r.ph $t6, $t6, 2 \n" + "shra_r.ph $t4, $t4, 2 \n" + "addu.ph $t6, $t6, $t4 \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "shra_r.ph $t6, $t6, 1 \n" + "addu $t0, $t0, $t1 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "shra_r.w $t0, $t0, 1 \n" + "srl $t1, $t6, 16 \n" + "sb $t1, 0(%[d]) \n" + "sb $t0, 1(%[d]) \n" + "sb $t6, 2(%[d]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[d], %[d], 3 \n" + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [src_stride] "+r" (src_stride), + [d] "+r" (d), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6" + ); +} + +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| + "wsbh $t0, $t0 \n" // |2|3|0|1| + "wsbh $t6, $t6 \n" // |26|27|24|25| + "srl $t0, $t0, 8 \n" // |X|2|3|0| + "srl $t3, 
$t3, 16 \n" // |X|X|15|14| + "srl $t5, $t5, 16 \n" // |X|X|23|22| + "srl $t7, $t7, 16 \n" // |X|X|31|30| + "ins $t1, $t2, 24, 8 \n" // |8|6|5|4| + "ins $t6, $t5, 0, 8 \n" // |26|27|24|22| + "ins $t1, $t0, 0, 16 \n" // |8|6|3|0| + "ins $t6, $t7, 24, 8 \n" // |30|27|24|22| + "prepend $t2, $t3, 24 \n" // |X|15|14|11| + "ins $t4, $t4, 16, 8 \n" // |19|16|17|X| + "ins $t4, $t2, 0, 16 \n" // |19|16|14|11| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu %[dst_width], %[dst_width], -12 \n" + "addiu $t8,%[dst_width], -12 \n" + "sw $t1, 0(%[dst]) \n" + "sw $t4, 4(%[dst]) \n" + "sw $t6, 8(%[dst]) \n" + "bgez $t8, 1b \n" + " addiu %[dst], %[dst], 12 \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8" + ); +} + +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + const uint8* t = src_ptr + stride; + const int c = 0x2AAA; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| + "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0| + "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4| + "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| + "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6| + "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4| + "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6 + "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4 + "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1| + "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3| + "srl $t4, $t4, 2 \n" // t4 / 4 + "srl $t6, $t6, 16 \n" // |0|0|S3|T3| + "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3 + "addu $t6, $t5, $t6 \n" + "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA + "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| + "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| + "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0 + "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0 + "addu $t0, $t0, $t2 \n" + "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[t], %[t], 8 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "addiu %[dst_ptr], %[dst_ptr], 3 \n" + "srl $t6, $t6, 16 \n" + "srl $t0, $t0, 16 \n" + "sb $t4, -1(%[dst_ptr]) \n" + "sb $t6, -2(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " sb $t0, -3(%[dst_ptr]) \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst_ptr] "+r" (dst_ptr), + [t] "+r" (t), + [dst_width] "+r" (dst_width) + : [c] "r" (c) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6" + ); +} + +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + const uint8* s1 = src_ptr + stride; + stride += stride; + const uint8* s2 = src_ptr + stride; + const int c1 = 0x1C71; + const int c2 = 0x2AAA; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| + "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0| + "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4| + "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0| + "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4| + "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| + "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6| + "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6 + "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4| + "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4 + "sll $t8, $t5, 16 \n" // |R5|R4|0|0| + "raddu.w.qb $t8, $t8 \n" // R5+R4 + "addu $t7, $t7, $t8 \n" + "srl $t8, $t5, 16 \n" // |0|0|R7|R6| + 
"raddu.w.qb $t8, $t8 \n" // R7 + R6 + "addu $t6, $t6, $t8 \n" + "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA + "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1| + "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1| + "srl $t8, $t8, 8 \n" // |0|S3|T3|R3| + "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3 + "addu $t7, $t7, $t8 \n" + "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71 + "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| + "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| + "sll $t4, $t4, 8 \n" // |R2|R1|R0|0| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t2, $t2 \n" + "raddu.w.qb $t4, $t4 \n" + "addu $t0, $t0, $t2 \n" + "addu $t0, $t0, $t4 \n" + "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71 + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[s1], %[s1], 8 \n" + "addiu %[s2], %[s2], 8 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "addiu %[dst_ptr], %[dst_ptr], 3 \n" + "srl $t6, $t6, 16 \n" + "srl $t7, $t7, 16 \n" + "srl $t0, $t0, 16 \n" + "sb $t6, -1(%[dst_ptr]) \n" + "sb $t7, -2(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " sb $t0, -3(%[dst_ptr]) \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst_ptr] "+r" (dst_ptr), + [s1] "+r" (s1), + [s2] "+r" (s2), + [dst_width] "+r" (dst_width) + : [c1] "r" (c1), [c2] "r" (c2) + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8" + ); +} + +#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/TMessagesProj/jni/libyuv/source/scale_neon.cc b/TMessagesProj/jni/libyuv/source/scale_neon.cc new file mode 100644 index 000000000..1b8a5ba58 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/scale_neon.cc @@ -0,0 +1,764 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) + +// NEON downscalers with interpolation. +// Provided by Fritz Koenig + +// Read 32x1 throw away even pixels, and write 16x1. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + MEMACCESS(1) + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + MEMACCESS(2) + "vst1.8 {q0}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc" + ); +} + +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride; + const uint8* src_ptr2 = src_ptr + src_stride * 2; + const uint8* src_ptr3 = src_ptr + src_stride * 3; +asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + MEMACCESS(3) + "vld1.8 {q1}, [%3]! \n" + MEMACCESS(4) + "vld1.8 {q2}, [%4]! \n" + MEMACCESS(5) + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + MEMACCESS(1) + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc" + ); +} + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc" + ); +} + +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! 
\n" + + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" + ); +} + +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN38_NEON +static uvec8 kShuf38 = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +static uvec8 kShuf38_2 = + { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; +static vec16 kMult38_Div6 = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +static vec16 kMult38_Div9 = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {q3}, [%3] \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + MEMACCESS(1) + "vst1.8 {d4}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" + ); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride * 2; + + asm volatile ( + MEMACCESS(5) + "vld1.16 {q13}, [%5] \n" + MEMACCESS(6) + "vld1.8 {q14}, [%6] \n" + MEMACCESS(7) + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + MEMACCESS(4) + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + MEMACCESS(1) + "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" + ); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(4) + "vld1.16 {q13}, [%4] \n" + MEMACCESS(5) + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + MEMACCESS(1) + "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" + ); +} + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! 
\n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + MEMACCESS(0) + "vst1.8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" + ); +} + +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS(0) + "vld2.32 {q0, q1}, [%0]! \n" + MEMACCESS(0) + "vld2.32 {q2, q3}, [%0]! \n" + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store odd pixels + MEMACCESS(1) + "vst1.8 {q3}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %3, lsl #2 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.32 {d0[0]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d0[1]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d1[0]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. 
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + MEMACCESS(1) + "vld1.8 {d1}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d3}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d4}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d5}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d6}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" + ); +} + +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/scale_neon64.cc b/TMessagesProj/jni/libyuv/source/scale_neon64.cc new file mode 100644 index 000000000..44df55c6c --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/scale_neon64.cc @@ -0,0 +1,789 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#ifdef HAS_SCALEROWDOWN2_NEON +// Read 32x1 throw away even pixels, and write 16x1. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + // load even pixels into v0, odd into v1 + MEMACCESS(0) + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" + "subs %2, %2, #16 \n" // 16 processed per loop + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); +} +#endif //HAS_SCALEROWDOWN2_NEON + +#ifdef HAS_SCALEROWDOWN2_NEON +// Read 32x2 average down and write 16x1. 
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc + MEMACCESS(1) + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "uaddlp v1.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + MEMACCESS(2) + "st1 {v0.16b}, [%2], #16 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif //HAS_SCALEROWDOWN2_NEON + +#ifdef HAS_SCALEROWDOWN4_NEON +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-3.8b}, [%0], #32 \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "st1 {v2.8b}, [%1], #8 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN4_NEON + +#ifdef HAS_SCALEROWDOWN4_NEON +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride; + const uint8* src_ptr2 = src_ptr + src_stride * 2; + const uint8* src_ptr3 = src_ptr + src_stride * 3; +asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + MEMACCESS(3) + "ld1 {v1.16b}, [%3], #16 \n" + MEMACCESS(4) + "ld1 {v2.16b}, [%4], #16 \n" + MEMACCESS(5) + "ld1 {v3.16b}, [%5], #16 \n" + "subs %2, %2, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v0.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" + "uadalp v0.8h, v3.16b \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + MEMACCESS(1) + "st1 {v0.s}[0], [%1], #4 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN4_NEON + +#ifdef HAS_SCALEROWDOWN34_NEON +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. 
+void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0 + "subs %2, %2, #24 \n" + "mov v2.8b, v3.8b \n" // order v0, v1, v2 + MEMACCESS(1) + "st3 {v0.8b-v2.8b}, [%1], #24 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN34_NEON + +#ifdef HAS_SCALEROWDOWN34_NEON +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0 + MEMACCESS(3) + "ld4 {v4.8b-v7.8b}, [%3], #32 \n" // src line 1 + "subs %2, %2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" + + // 3 * line_0 + line_1 + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" + + // (3 * line_0 + line_1) >> 2 + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" + + MEMACCESS(1) + "st3 {v0.8b-v2.8b}, [%1], #24 \n" + + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", + "v20", "memory", "cc" + ); +} +#endif //ScaleRowDown34_0_Box_NEON + +#ifdef HAS_SCALEROWDOWN34_NEON +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0 + MEMACCESS(3) + "ld4 {v4.8b-v7.8b}, [%3], #32 \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" + + MEMACCESS(1) + "st3 {v0.8b-v2.8b}, [%1], #24 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN34_NEON + +#ifdef HAS_SCALEROWDOWN38_NEON +static uvec8 kShuf38 = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +static uvec8 kShuf38_2 = + { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 }; +static vec16 kMult38_Div6 = + { 65536 / 12, 
65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +static vec16 kMult38_Div9 = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(3) + "ld1 {v3.16b}, [%3] \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" + "subs %2, %2, #12 \n" + "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" + MEMACCESS(1) + "st1 {v2.8b}, [%1], #8 \n" + MEMACCESS(1) + "st1 {v2.s}[2], [%1], #4 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} + +#endif //HAS_SCALEROWDOWN38_NEON + +#ifdef HAS_SCALEROWDOWN38_NEON +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride * 2; + + asm volatile ( + MEMACCESS(5) + "ld1 {v29.8h}, [%5] \n" + MEMACCESS(6) + "ld1 {v30.16b}, [%6] \n" + MEMACCESS(7) + "ld1 {v31.8h}, [%7] \n" + "add %3, %3, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" + MEMACCESS(3) + "ld4 {v4.8b-v7.8b}, [%3], #32 \n" + MEMACCESS(4) + "ld4 {v16.8b-v19.8b}, [%4], #32 \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" + + // combine source lines + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. 
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" + + // combine source lines + "add v0.8h, v0.8h, v16.8h \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + + MEMACCESS(1) + "st1 {v3.8b}, [%1], #8 \n" + MEMACCESS(1) + "st1 {v3.s}[2], [%1], #4 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", + "v30", "v31", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN38_NEON + +#ifdef HAS_SCALEROWDOWN38_NEON +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(4) + "ld1 {v30.8h}, [%4] \n" + MEMACCESS(5) + "ld1 {v31.16b}, [%5] \n" + "add %3, %3, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" + MEMACCESS(3) + "ld4 {v4.8b-v7.8b}, [%3], #32 \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + + // combine source lines + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "uqrshrn v2.8b, v2.8h, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + + // combine source lines + "uaddl v0.8h, v0.8b, v4.8b \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. 
So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + + MEMACCESS(1) + "st1 {v3.8b}, [%1], #8 \n" + MEMACCESS(1) + "st1 {v3.s}[2], [%1], #4 \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v30", "v31", "memory", "cc" + ); +} +#endif //HAS_SCALEROWDOWN38_NEON + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + int y_fraction = 256 - source_y_fraction; + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %3, %3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %3, %3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %3, %3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v0.16b}, [%2], #16 \n" + "subs %3, %3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ "100: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "bgt 100b \n" + + "99: \n" + MEMACCESS(0) + "st1 {v0.b}[15], [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction),// %4 + "+r"(y_fraction) // %5 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" + ); +} + +#ifdef HAS_SCALEARGBROWDOWN2_NEON +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS (0) + "ld2 {v0.4s, v1.4s}, [%0], #32 \n" + MEMACCESS (0) + "ld2 {v2.4s, v3.4s}, [%0], #32 \n" + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS (1) + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + MEMACCESS (1) + "st1 {v3.16b}, [%1], #16 \n" + "bgt 1b \n" + : "+r" (src_ptr), // %0 + "+r" (dst), // %1 + "+r" (dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif //HAS_SCALEARGBROWDOWN2_NEON + +#ifdef HAS_SCALEARGBROWDOWN2_NEON +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + MEMACCESS (0) + "ld4 {v0.16b - v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + MEMACCESS (1) + "ld4 {v16.16b - v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + MEMACCESS (2) + "st4 {v0.8b - v3.8b}, [%2], #32 \n" + "bgt 1b \n" + : "+r" (src_ptr), // %0 + "+r" (src_stride), // %1 + "+r" (dst), // %2 + "+r" (dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19" + ); +} +#endif //HAS_SCALEARGBROWDOWN2_NEON + +#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, uint8* dst_argb, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.s}[0], [%0], %3 \n" + MEMACCESS(0) + "ld1 {v0.s}[1], [%0], %3 \n" + MEMACCESS(0) + "ld1 {v0.s}[2], [%0], %3 \n" + MEMACCESS(0) + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx * 4) // %3 + : "memory", "cc", "v0" + ); +} +#endif //HAS_SCALEARGBROWDOWNEVEN_NEON + +#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +// TODO, might be worth another optimization pass in future. +// It could be upgraded to 8 pixels at a time to start with. 
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "add %1, %1, %0 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 + MEMACCESS(1) + "ld1 {v1.8b}, [%1], %4 \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], %4 \n" + MEMACCESS(1) + "ld1 {v3.8b}, [%1], %4 \n" + MEMACCESS(0) + "ld1 {v4.8b}, [%0], %4 \n" + MEMACCESS(1) + "ld1 {v5.8b}, [%1], %4 \n" + MEMACCESS(0) + "ld1 {v6.8b}, [%0], %4 \n" + MEMACCESS(1) + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + MEMACCESS(2) + "st1 {v0.16b}, [%2], #16 \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx * 4) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_SCALEARGBROWDOWNEVEN_NEON +#endif // __aarch64__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/scale_posix.cc b/TMessagesProj/jni/libyuv/source/scale_posix.cc new file mode 100644 index 000000000..352e66782 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/scale_posix.cc @@ -0,0 +1,1315 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf2 = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +static uvec8 kShuf01 = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static uvec8 kShuf11 = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
+static uvec8 kShuf21 = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +static uvec8 kMadd01 = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +static uvec8 kMadd11 = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +static uvec8 kMadd21 = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +static vec16 kRound34 = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static uvec8 kShuf38a = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +static uvec8 kShuf38b = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +static uvec8 kShufAc = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +static uvec8 kShufAc3 = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +static uvec16 kScaleAc33 = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb0 = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb1 = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb2 = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +static uvec16 kScaleAb2 = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; + +// GCC versions of row functions are verbatim conversions from Visual C. 
+// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt + +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + 
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stridex3 = 0; + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" + + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3 + MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4 + MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand 
%%xmm7,%%xmm2 \n" + "pand %%xmm7,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "pavgw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" +#endif + ); +} + +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movq %%xmm1," MEMACCESS2(0x8,1) " \n" + "movq %%xmm2," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + 
); +} + +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x8,1) " \n" + "lea " MEMLEA(0xc,1) ",%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm4", "xmm5" +#endif + ); +} + +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + 
"sub $0x6,%2 \n" + "movd %%xmm1," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6 + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "sub $0x6,%2 \n" + "movd %%xmm6," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int tmp_height = 0; + intptr_t tmp_src = 0; + asm volatile ( + "pxor %%xmm4,%%xmm4 \n" + "sub $0x1,%5 \n" + + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "mov %0,%3 \n" + "add %6,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "mov %5,%2 \n" + "test %2,%2 \n" + "je 3f \n" + + LABELALIGN + "2: \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "add %6,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "sub $0x1,%2 \n" + "jg 2b \n" + + LABELALIGN + "3: \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x10,3) ",%0 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_height), // %2 + "+r"(tmp_src), // %3 + "+r"(src_width), // %4 + "+rm"(src_height) // %5 + : "rm"((intptr_t)(src_stride)) // %6 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} + +// Bilinear column filtering. 
SSSE3 version. +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0, temp_pixel = 0; + asm volatile ( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + BUNDLEALIGN + MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2 + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %w2," MEMACCESS(0) " \n" + "lea " MEMLEA(0x2,0) ",%0 \n" + "sub $0x2,%5 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %b2," MEMACCESS(0) " \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+a"(temp_pixel), // %2 + "+r"(x0), // %3 + "+r"(x1), // %4 + "+rm"(dst_width) // %5 + : "rm"(x), // %6 + "rm"(dx) // %7 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
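+// For reference, a rough C equivalent (illustrative sketch, not the shipped
+// code): every source byte is written out twice:
+//   for (int x = 0; x < dst_width / 2; ++x) {
+//     dst_ptr[2 * x + 0] = src_ptr[x];
+//     dst_ptr[2 * x + 1] = src_ptr[x];
+//   }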
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "sub $0x20,%2 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 + MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. 
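+// For reference, a rough C equivalent (illustrative sketch, not the shipped
+// code): every src_stepx-th ARGB pixel is copied straight through:
+//   for (int x = 0; x < dst_width; ++x) {
+//     memcpy(dst_argb + x * 4, src_argb + x * src_stepx * 4, 4);
+//   }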
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12 = 0; + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + LABELALIGN + "1: \n" + "movd " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 + "punpckldq %%xmm1,%%xmm0 \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 + MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "+r"(src_stepx_x12) // %4 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, + uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12 = 0; + intptr_t row1 = (intptr_t)(src_stride); + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" + + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 + MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 + BUNDLEALIGN + MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "movq " MEMACCESS(5) ",%%xmm2 \n" + BUNDLEALIGN + MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 + MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 + MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "+r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + LABELALIGN + "40: \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + 
MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1 + MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4 + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "sub $0x4,%4 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jge 40b \n" + + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x8,2) ",%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + "movd %%xmm0," MEMACCESS(2) " \n" + "99: \n" + : "+a"(x0), // %0 + "+d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +// Bilinear row filtering combines 4x2 -> 4x1. 
SSSE3 version +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); + + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "psrlw $0x9,%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(0) " \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + BUNDLEALIGN + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(0) " \n" + + LABELALIGN + "99: \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "+r"(x0), // %3 + "+r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_X86(int num, int div) { + asm volatile ( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx" + ); + return num; +} + +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_X86(int num, int div) { + asm volatile ( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx" + ); + return num; +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/scale_win.cc b/TMessagesProj/jni/libyuv/source/scale_win.cc new file mode 100644 index 000000000..840b9738d --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/scale_win.cc @@ -0,0 +1,1320 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Visual C x86. 
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf2 = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +static uvec8 kShuf01 = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static uvec8 kShuf11 = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf21 = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +static uvec8 kMadd01 = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +static uvec8 kMadd11 = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +static uvec8 kMadd21 = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +static vec16 kRound34 = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static uvec8 kShuf38a = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +static uvec8 kShuf38b = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +static uvec8 kShufAc = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +static uvec8 kShufAc3 = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +static uvec16 kScaleAc33 = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb0 = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb1 = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb2 = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +static uvec16 kScaleAb2 = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; + +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
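+// For reference, a rough C equivalent (illustrative sketch, not the shipped
+// code): each output byte is the rounded average of a horizontal pair:
+//   dst_ptr[x] = (src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1;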
+__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
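+// Conceptually each output byte is the rounded average of a 2x2 box,
+//   dst_ptr[x] = (s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1] + 2) >> 2;
+// with s the current row and t = s + src_stride (illustrative sketch, not the
+// shipped code); the SSE2 below approximates this with chained pavgb/pavgw.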
+__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Point samples 32 pixels to 8 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + psrld xmm5, 24 + pslld xmm5, 16 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm0, 8 + packuswb xmm0, xmm0 + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + + ret + } +} + +// Blends 32x4 rectangle to 8x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, [eax + esi * 2] + movdqa xmm3, [eax + esi * 2 + 16] + movdqa xmm4, [eax + edi] + movdqa xmm5, [eax + edi + 16] + lea eax, [eax + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa xmm2, xmm0 // average columns (16 to 8 pixels) + psrlw xmm0, 8 + pand xmm2, xmm7 + pavgw xmm0, xmm2 + packuswb xmm0, xmm0 + + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + + pop edi + pop esi + ret + } +} + +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
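+// For reference, the sampling pattern in rough C (illustrative sketch, not
+// the shipped code): every 4 source pixels yield 3 output pixels, keeping
+// indices 0, 1 and 3:
+//   for (int i = 0; i < dst_width / 3; ++i) {
+//     dst_ptr[3 * i + 0] = src_ptr[4 * i + 0];
+//     dst_ptr[3 * i + 1] = src_ptr[4 * i + 1];
+//     dst_ptr[3 * i + 2] = src_ptr[4 * i + 3];
+//   }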
+__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm3, kShuf0 + movdqa xmm4, kShuf1 + movdqa xmm5, kShuf2 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm1 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 kRound34 + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + align 4 + wloop: + movdqa xmm0, [eax] // pixels 0..7 + movdqa xmm1, [eax + esi] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqa xmm0, [eax + 16] // pixels 16..23 + movdqa xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + sub ecx, 24 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx + 24] + jg wloop + + pop esi + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
+__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + align 4 + wloop: + movdqa xmm0, [eax] // pixels 0..7 + movdqa xmm1, [eax + esi] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqa xmm0, [eax + 16] // pixels 16..23 + movdqa xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + sub ecx, 24 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx+24] + jg wloop + + pop esi + ret + } +} + +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm4, kShuf38a + movdqa xmm5, kShuf38b + + align 4 + xloop: + movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + lea eax, [eax + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + paddusb xmm0, xmm1 + + sub ecx, 12 + movq qword ptr [edx], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edx + 8], xmm1 + lea edx, [edx + 12] + jg xloop + + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAc + movdqa xmm3, kShufAc3 + movdqa xmm4, kScaleAc33 + pxor xmm5, xmm5 + + align 4 + xloop: + movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqa xmm6, [eax + esi] + movhlps xmm1, xmm0 + movhlps xmm7, xmm6 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + movdqa xmm6, [eax + esi * 2] + lea eax, [eax + 16] + movhlps xmm7, xmm6 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + pshufb xmm6, xmm2 + + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + pshufb xmm7, xmm3 + paddusw xmm6, xmm7 + + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + packuswb xmm6, xmm6 + + sub ecx, 6 + movd [edx], xmm6 // write 6 pixels + psrlq xmm6, 16 + movd [edx + 2], xmm6 + lea edx, [edx + 6] + jg xloop + + pop esi + ret + } +} + +// Scale 16x2 pixels to 6x1 with 
interpolation +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAb0 + movdqa xmm3, kShufAb1 + movdqa xmm4, kShufAb2 + movdqa xmm5, kScaleAb2 + + align 4 + xloop: + movdqa xmm0, [eax] // average 2 rows into xmm0 + pavgb xmm0, [eax + esi] + lea eax, [eax + 16] + + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + pshufb xmm1, xmm2 + movdqa xmm6, xmm0 + pshufb xmm6, xmm3 + paddusw xmm1, xmm6 + pshufb xmm0, xmm4 + paddusw xmm1, xmm0 + + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + packuswb xmm1, xmm1 + + sub ecx, 6 + movd [edx], xmm1 // write 6 pixels + psrlq xmm1, 16 + movd [edx + 2], xmm1 + lea edx, [edx + 6] + jg xloop + + pop esi + ret + } +} + +// Reads 16xN bytes and produces 16 shorts at a time. +// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. +__declspec(naked) __declspec(align(16)) +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + __asm { + push esi + push edi + push ebx + push ebp + mov esi, [esp + 16 + 4] // src_ptr + mov edx, [esp + 16 + 8] // src_stride + mov edi, [esp + 16 + 12] // dst_ptr + mov ecx, [esp + 16 + 16] // dst_width + mov ebx, [esp + 16 + 20] // height + pxor xmm4, xmm4 + dec ebx + + align 4 + xloop: + // first row + movdqa xmm0, [esi] + lea eax, [esi + edx] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + lea esi, [esi + 16] + mov ebp, ebx + test ebp, ebp + je ydone + + // sum remaining rows + align 4 + yloop: + movdqa xmm2, [eax] // read 16 pixels + lea eax, [eax + edx] // advance to next row + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + paddusw xmm0, xmm2 // sum 16 words + paddusw xmm1, xmm3 + sub ebp, 1 + jg yloop + + align 4 + ydone: + movdqa [edi], xmm0 + movdqa [edi + 16], xmm1 + lea edi, [edi + 32] + + sub ecx, 16 + jg xloop + + pop ebp + pop ebx + pop edi + pop esi + ret + } +} + +// Bilinear column filtering. SSSE3 version. +// TODO(fbarchard): Port to Neon +// TODO(fbarchard): Switch the following: +// xor ebx, ebx +// mov bx, word ptr [esi + eax] // 2 source x0 pixels +// To +// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels +// when drmemory bug fixed. +// https://code.google.com/p/drmemory/issues/detail?id=1396 + +__declspec(naked) __declspec(align(16)) +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + __asm { + push ebx + push esi + push edi + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width + movd xmm2, [esp + 12 + 16] // x + movd xmm3, [esp + 12 + 20] // dx + mov eax, 0x04040000 // shuffle to line up fractions with pixel. + movd xmm5, eax + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + align 4 + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. 
+ paddd xmm2, xmm3 // x += dx + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm1, 9 // 7 bit fractions. + movzx ebx, word ptr [esi + edx] // 2 source x1 pixels + movd xmm4, ebx + pshufb xmm1, xmm5 // 0011 + punpcklwd xmm0, xmm4 + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits, 2 pixels. + movd ebx, xmm0 + mov [edi], bx + lea edi, [edi + 2] + sub ecx, 2 // 2 pixels + jge xloop2 + + align 4 + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // 16 bit + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits + movd ebx, xmm0 + mov [edi], bl + + align 4 + xloop99: + + pop edi + pop esi + pop ebx + ret + } +} + +// Reads 16 pixels, duplicates them and writes 32 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + sub ecx, 32 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + + ret + } +} + +// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + shufps xmm0, xmm1, 0xdd + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 8x1 rectangle to 4x1. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 8x2 rectangle to 4x1. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
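For readers skimming the assembly, here is a scalar reference sketch (not part of the patch) of what these ARGB down-by-2 row kernels compute: ScaleARGBRowDown2 keeps one pixel of each horizontal pair, the Linear variant averages the pair, and the Box variant below averages a full 2x2 block per channel. Function names and the exact rounding are illustrative only; the SSE2 code rounds with pavgb rather than an exact rounded mean.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Keep one ARGB pixel (4 bytes) out of every horizontal pair.
static void ARGBRowDown2_Ref(const uint8_t* src_argb, uint8_t* dst_argb,
                             int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    memcpy(dst_argb + 4 * x, src_argb + 8 * x + 4, 4);
  }
}

// 2x2 box filter: average each channel over a 2x2 block of source pixels.
static void ARGBRowDown2Box_Ref(const uint8_t* src_argb, ptrdiff_t src_stride,
                                uint8_t* dst_argb, int dst_width) {
  const uint8_t* row0 = src_argb;
  const uint8_t* row1 = src_argb + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    for (int c = 0; c < 4; ++c) {  // B, G, R, A
      int i = 8 * x + c;
      dst_argb[4 * x + c] =
          (uint8_t)((row0[i] + row0[i + 4] + row1[i] + row1[i + 4] + 2) >> 2);
    }
  }
}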
+__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push edi + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + align 4 + wloop: + movd xmm0, [eax] + movd xmm1, [eax + ebx] + punpckldq xmm0, xmm1 + movd xmm2, [eax + ebx * 2] + movd xmm3, [eax + edi] + lea eax, [eax + ebx * 4] + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop edi + pop ebx + ret + } +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + align 4 + wloop: + movq xmm0, qword ptr [eax] // row0 4 pairs + movhps xmm0, qword ptr [eax + ebx] + movq xmm1, qword ptr [eax + ebx * 2] + movhps xmm1, qword ptr [eax + edi] + lea eax, [eax + ebx * 4] + movq xmm2, qword ptr [esi] // row1 4 pairs + movhps xmm2, qword ptr [esi + ebx] + movq xmm3, qword ptr [esi + ebx * 2] + movhps xmm3, qword ptr [esi + edi] + lea esi, [esi + ebx * 4] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop edi + pop esi + pop ebx + ret + } +} + +// Column scaling unfiltered. SSE2 version. 
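As a plain-C counterpart (not part of the patch) to the unfiltered column scaler that follows: the x and dx parameters are 16.16 fixed-point values, so each output pixel copies the source pixel at x >> 16 and then steps x by dx. The step itself is the kind of value produced by a FixedDiv-style division such as the FixedDiv_X86 helper defined later in this file. Names below are illustrative.

#include <stdint.h>
#include <string.h>

// 16.16 fixed-point quotient, the kind of step value FixedDiv_X86 computes.
static int FixedDiv_Ref(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}

// Unfiltered (nearest) ARGB column scaling: copy the pixel at the integer
// part of x, then advance x by dx. The SSE2 version below does the same for
// four output pixels at a time, gathering them with movd.
static void ARGBCols_Ref(uint8_t* dst_argb, const uint8_t* src_argb,
                         int dst_width, int x, int dx) {
  for (int i = 0; i < dst_width; ++i) {
    memcpy(dst_argb + 4 * i, src_argb + 4 * (x >> 16), 4);
    x += dx;
  }
}

A caller would typically set dx = FixedDiv_Ref(src_width, dst_width) so that x sweeps the whole source row while filling dst_width output pixels.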
+__declspec(naked) __declspec(align(16)) +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push edi + push esi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + paddd xmm2, xmm0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. + + cmp ecx, 0 + jle xloop99 + sub ecx, 4 + jl xloop49 + + // 4 Pixel loop. + align 4 + xloop4: + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 + + movd xmm1, [esi + eax * 4] // 1 source x2 pixels + movd xmm4, [esi + edx * 4] // 1 source x3 pixels + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + sub ecx, 4 // 4 pixels + movdqu [edi], xmm0 + lea edi, [edi + 16] + jge xloop4 + + align 4 + xloop49: + test ecx, 2 + je xloop29 + + // 2 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + punpckldq xmm0, xmm1 // x0 x1 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + + xloop29: + test ecx, 1 + je xloop99 + + // 1 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x2 pixels + movd dword ptr [edi], xmm0 + align 4 + xloop99: + + pop esi + pop edi + ret + } +} + +// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. +// TODO(fbarchard): Port to Neon + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +__declspec(naked) __declspec(align(16)) +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + movdqa xmm4, kShuffleColARGB + movdqa xmm5, kShuffleFractions + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + align 4 + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + psrlw xmm1, 9 // 7 bit fractions. 
+ movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 2 // 2 pixels + jge xloop2 + + align 4 + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + psrlw xmm0, 7 + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + movd [edi], xmm0 + + align 4 + xloop99: + + pop edi + pop esi + ret + } +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpckldq xmm0, xmm0 + punpckhdq xmm1, xmm1 + sub ecx, 8 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. +__declspec(naked) __declspec(align(16)) +int FixedDiv_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + idiv dword ptr [esp + 8] + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. +__declspec(naked) __declspec(align(16)) +int FixedDiv1_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + mov ecx, [esp + 8] // denom + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + sub eax, 0x00010001 + sbb edx, 0 + sub ecx, 1 + idiv ecx + ret + } +} + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/TMessagesProj/jni/libyuv/source/video_common.cc b/TMessagesProj/jni/libyuv/source/video_common.cc new file mode 100644 index 000000000..efbedf46e --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/video_common.cc @@ -0,0 +1,64 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0])) + +struct FourCCAliasEntry { + uint32 alias; + uint32 canonical; +}; + +static const struct FourCCAliasEntry kFourCCAliases[] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. + {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, + {FOURCC_RGB3, FOURCC_RAW }, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 +}; +// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. +// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA + +LIBYUV_API +uint32 CanonicalFourCC(uint32 fourcc) { + int i; + for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) { + if (kFourCCAliases[i].alias == fourcc) { + return kFourCCAliases[i].canonical; + } + } + // Not an alias, so return it as-is. + return fourcc; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/TMessagesProj/jni/libyuv/source/x86inc.asm b/TMessagesProj/jni/libyuv/source/x86inc.asm new file mode 100644 index 000000000..cb5c32df3 --- /dev/null +++ b/TMessagesProj/jni/libyuv/source/x86inc.asm @@ -0,0 +1,1136 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2012 x264 project +;* +;* Authors: Loren Merritt +;* Anton Mitrofanov +;* Jason Garrett-Glaser +;* Henrik Gramner +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. 
Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +; Local changes for libyuv: +; remove %define program_name and references in labels +; rename cpus to uppercase + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +; Name of the .rodata section. +; Kludge: Something on OS X fails to align .rodata even given an align attribute, +; so use a different read-only section. +%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,macho64 + SECTION .text align=%1 + %elifidn __OUTPUT_FORMAT__,macho + SECTION .text align=%1 + fakegot: + %elifidn __OUTPUT_FORMAT__,aout + section .text + %else + SECTION .rodata align=%1 + %endif +%endmacro + +; aout does not support align= +%macro SECTION_TEXT 0-1 16 + %ifidn __OUTPUT_FORMAT__,aout + SECTION .text + %else + SECTION .text align=%1 + %endif +%endmacro + +%if WIN64 + %define PIC +%elif ARCH_X86_64 == 0 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. + %undef PIC +%endif +%ifdef PIC + default rel +%endif + +; Always use long nops (reduces 0x90 spam in disassembly on x86_32) +CPU amdnop + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,0, dst, src, tmp +; declares a function (foo), taking two args (dst and src) and one local variable (tmp) + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons +; which are slow when a normal ret follows a branch. 
+ +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rsp + stack_offset + %3] + %define r%1mp qword r %+ %1m + %else + %define r%1m [esp + stack_offset + %3] + %define r%1mp dword r %+ %1m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 +%if ARCH_X86_64 == 0 + %define r%1 e%1 +%endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %assign stack_offset stack_offset+gprsize +%endmacro + +%macro POP 1 + pop %1 + %assign stack_offset stack_offset-gprsize +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assert failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + 
+%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 112 +DECLARE_REG 14, R15, 120 + +%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + %if mmsize == 8 + %assign xmm_regs_used 0 + %else + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS %4 +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + %if xmm_regs_used > 6 + SUB rsp, (xmm_regs_used-6)*16+16 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i + %endrep + %endif +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 1 + %if xmm_regs_used > 6 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] + %endrep + add %1, (xmm_regs_used-6)*16+16 + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 + %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL rsp + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R12, 48 +DECLARE_REG 12, R13, 56 +DECLARE_REG 13, R14, 64 +DECLARE_REG 14, R15, 72 + +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS %4 +%endmacro + +%define has_epilogue regs_used > 9 || mmsize == 32 + +%macro RET 0 + POP_IF_USED 14, 13, 12, 11, 10, 9 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [esp + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + %if regs_used > 7 + %assign regs_used 7 + %endif + ASSERT regs_used >= num_args + PUSH_IF_USED 3, 4, 5, 6 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS %4 +%endmacro + +%define has_epilogue regs_used > 3 || mmsize == 32 + +%macro RET 0 + POP_IF_USED 6, 5, 4, 3 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 +%macro WIN64_SPILL_XMM 1 +%endmacro +%macro WIN64_RESTORE_XMM 1 +%endmacro +%endif + +%macro REP_RET 0 + %if has_epilogue + RET + %else + rep ret + %endif +%endmacro + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +%macro cglobal 1-2+ ; name, [PROLOGUE args] +%if %0 == 1 + cglobal_internal %1 %+ SUFFIX +%else + cglobal_internal %1 %+ SUFFIX, %2 +%endif +%endmacro +%macro cglobal_internal 1-2+ + %ifndef cglobaled_%1 + %xdefine %1 mangle(%1) + %xdefine %1.skip_prologue %1 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %1, 1 + %endif + %xdefine current_function %1 + %ifidn __OUTPUT_FORMAT__,elf + global %1:function hidden + %else + global %1 + %endif + align function_align + %1: + RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer + %assign stack_offset 0 + %if %0 > 1 + PROLOGUE %2 + %endif +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(%1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %xdefine %1 mangle(%1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 2+ + %xdefine %1 mangle(%1) + global %1 + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is +; executable by default. 
+%ifidn __OUTPUT_FORMAT__,elf +SECTION .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif + +; cpuflags + +%assign cpuflags_MMX (1<<0) +%assign cpuflags_MMX2 (1<<1) | cpuflags_MMX +%assign cpuflags_3dnow (1<<2) | cpuflags_MMX +%assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow +%assign cpuflags_SSE (1<<4) | cpuflags_MMX2 +%assign cpuflags_SSE2 (1<<5) | cpuflags_SSE +%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2 +%assign cpuflags_SSE3 (1<<7) | cpuflags_SSE2 +%assign cpuflags_SSSE3 (1<<8) | cpuflags_SSE3 +%assign cpuflags_SSE4 (1<<9) | cpuflags_SSSE3 +%assign cpuflags_SSE42 (1<<10)| cpuflags_SSE4 +%assign cpuflags_AVX (1<<11)| cpuflags_SSE42 +%assign cpuflags_xop (1<<12)| cpuflags_AVX +%assign cpuflags_fma4 (1<<13)| cpuflags_AVX +%assign cpuflags_AVX2 (1<<14)| cpuflags_AVX +%assign cpuflags_fma3 (1<<15)| cpuflags_AVX + +%assign cpuflags_cache32 (1<<16) +%assign cpuflags_cache64 (1<<17) +%assign cpuflags_slowctz (1<<18) +%assign cpuflags_lzcnt (1<<19) +%assign cpuflags_misalign (1<<20) +%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<22) +%assign cpuflags_bmi1 (1<<23) +%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1 +%assign cpuflags_tbm (1<<25)|cpuflags_bmi1 + +%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) +%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) + +; Takes up to 2 cpuflags from the above list. +; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 
+%macro INIT_CPUFLAGS 0-2 + %if %0 >= 1 + %xdefine cpuname %1 + %assign cpuflags cpuflags_%1 + %if %0 >= 2 + %xdefine cpuname %1_%2 + %assign cpuflags cpuflags | cpuflags_%2 + %endif + %xdefine SUFFIX _ %+ cpuname + %if cpuflag(AVX) + %assign AVX_enabled 1 + %endif + %if mmsize == 16 && notcpuflag(SSE2) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elifidn %1, SSE3 + %define movu lddqu + %endif + %else + %xdefine SUFFIX + %undef cpuname + %undef cpuflags + %endif +%endmacro + +; merge MMX and SSE* + +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0-1+ + %assign AVX_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nmm, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_XMM 0-1+ + %assign AVX_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nxmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_YMM 0-1+ + %assign AVX_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova vmovaps + %define movu vmovups + %undef movh + %define movnta vmovntps + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nymm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +INIT_XMM + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap +%rep %0/2 + %xdefine tmp%2 m%2 + %xdefine ntmp%2 nm%2 + %rotate 2 +%endrep +%rep %0/2 + %xdefine m%1 tmp%2 + %xdefine nm%1 ntmp%2 + %undef tmp%2 + %undef ntmp%2 + %rotate 2 +%endrep +%endmacro + +%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) +%rep %0-1 +%ifdef m%1 + %xdefine tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 tmp + CAT_XDEFINE n, m%1, %1 + CAT_XDEFINE n, m%2, %2 +%else + ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here. + ; Be careful using this mode in nested macros though, as in some cases there may be + ; other copies of m# that have already been dereferenced and don't get updated correctly. 
+ %xdefine %%n1 n %+ %1 + %xdefine %%n2 n %+ %2 + %xdefine tmp m %+ %%n1 + CAT_XDEFINE m, %%n1, m %+ %%n2 + CAT_XDEFINE m, %%n2, tmp + CAT_XDEFINE n, m %+ %%n1, %%n1 + CAT_XDEFINE n, m %+ %%n2, %%n2 +%endif + %undef tmp + %rotate 1 +%endrep +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. +%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE n, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + call_internal %1, %1 %+ SUFFIX +%endmacro +%macro call_internal 2 + %xdefine %%i %1 + %ifndef cglobaled_%1 + %ifdef cglobaled_%2 + %xdefine %%i %2 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 16 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 +%assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-AVX emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) +;%4 == number of operands given +;%5+: operands +%macro RUN_AVX_INSTR 6-7+ + %ifid %6 + %define %%sizeofreg sizeof%6 + %elifid %5 + %define %%sizeofreg sizeof%5 + %else + %define %%sizeofreg mmsize + %endif + %if %%sizeofreg==32 + %if %4>=3 + v%1 %5, %6, %7 + %else + v%1 %5, %6 + %endif + %else + %if %%sizeofreg==8 + %define %%regmov movq + %elif %2 + %define %%regmov movaps + %else + %define %%regmov movdqa + %endif + + %if %4>=3+%3 + %ifnidn %5, %6 + %if AVX_enabled && %%sizeofreg==16 + v%1 %5, %6, %7 + %else + CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 + %%regmov %5, %6 + %1 %5, %7 + %endif + %else + %1 %5, %7 + %endif + %elif %4>=3 + %1 %5, %6, %7 + %else + %1 %5, %6 + %endif + %endif +%endmacro + +; 3arg AVX ops with a memory arg can only have it in src2, +; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov). +; So, if the op is symmetric and the wrong one is memory, swap them. 
+%macro RUN_AVX_INSTR1 8 + %assign %%swap 0 + %if AVX_enabled + %ifnid %6 + %assign %%swap 1 + %endif + %elifnidn %5, %6 + %ifnid %7 + %assign %%swap 1 + %endif + %endif + %if %%swap && %3 == 0 && %8 == 1 + RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6 + %else + RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7 + %endif +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) +;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 4 + %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4 + %ifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, 2, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +AVX_INSTR addpd, 1, 0, 1 +AVX_INSTR addps, 1, 0, 1 +AVX_INSTR addsd, 1, 0, 1 +AVX_INSTR addss, 1, 0, 1 +AVX_INSTR addsubpd, 1, 0, 0 +AVX_INSTR addsubps, 1, 0, 0 +AVX_INSTR andpd, 1, 0, 1 +AVX_INSTR andps, 1, 0, 1 +AVX_INSTR andnpd, 1, 0, 0 +AVX_INSTR andnps, 1, 0, 0 +AVX_INSTR blendpd, 1, 0, 0 +AVX_INSTR blendps, 1, 0, 0 +AVX_INSTR blendvpd, 1, 0, 0 +AVX_INSTR blendvps, 1, 0, 0 +AVX_INSTR cmppd, 1, 0, 0 +AVX_INSTR cmpps, 1, 0, 0 +AVX_INSTR cmpsd, 1, 0, 0 +AVX_INSTR cmpss, 1, 0, 0 +AVX_INSTR cvtdq2ps, 1, 0, 0 +AVX_INSTR cvtps2dq, 1, 0, 0 +AVX_INSTR divpd, 1, 0, 0 +AVX_INSTR divps, 1, 0, 0 +AVX_INSTR divsd, 1, 0, 0 +AVX_INSTR divss, 1, 0, 0 +AVX_INSTR dppd, 1, 1, 0 +AVX_INSTR dpps, 1, 1, 0 +AVX_INSTR haddpd, 1, 0, 0 +AVX_INSTR haddps, 1, 0, 0 +AVX_INSTR hsubpd, 1, 0, 0 +AVX_INSTR hsubps, 1, 0, 0 +AVX_INSTR maxpd, 1, 0, 1 +AVX_INSTR maxps, 1, 0, 1 +AVX_INSTR maxsd, 1, 0, 1 +AVX_INSTR maxss, 1, 0, 1 +AVX_INSTR minpd, 1, 0, 1 +AVX_INSTR minps, 1, 0, 1 +AVX_INSTR minsd, 1, 0, 1 +AVX_INSTR minss, 1, 0, 1 +AVX_INSTR movhlps, 1, 0, 0 +AVX_INSTR movlhps, 1, 0, 0 +AVX_INSTR movsd, 1, 0, 0 +AVX_INSTR movss, 1, 0, 0 +AVX_INSTR mpsadbw, 0, 1, 0 +AVX_INSTR mulpd, 1, 0, 1 +AVX_INSTR mulps, 1, 0, 1 +AVX_INSTR mulsd, 1, 0, 1 +AVX_INSTR mulss, 1, 0, 1 +AVX_INSTR orpd, 1, 0, 1 +AVX_INSTR orps, 1, 0, 1 +AVX_INSTR pabsb, 0, 0, 0 +AVX_INSTR pabsw, 0, 0, 0 +AVX_INSTR pabsd, 0, 0, 0 +AVX_INSTR packsswb, 0, 0, 0 +AVX_INSTR packssdw, 0, 0, 0 +AVX_INSTR packuswb, 0, 0, 0 +AVX_INSTR packusdw, 0, 0, 0 +AVX_INSTR paddb, 0, 0, 1 +AVX_INSTR paddw, 0, 0, 1 +AVX_INSTR paddd, 0, 0, 1 +AVX_INSTR paddq, 0, 0, 1 +AVX_INSTR paddsb, 0, 0, 1 +AVX_INSTR paddsw, 0, 0, 1 +AVX_INSTR paddusb, 0, 0, 1 +AVX_INSTR paddusw, 0, 0, 1 +AVX_INSTR palignr, 0, 1, 0 +AVX_INSTR pand, 0, 0, 1 +AVX_INSTR pandn, 0, 0, 0 +AVX_INSTR pavgb, 0, 0, 1 +AVX_INSTR pavgw, 0, 0, 1 +AVX_INSTR pblendvb, 0, 0, 0 +AVX_INSTR pblendw, 0, 1, 0 +AVX_INSTR pcmpestri, 0, 0, 0 +AVX_INSTR pcmpestrm, 0, 0, 0 +AVX_INSTR pcmpistri, 0, 0, 0 +AVX_INSTR pcmpistrm, 0, 0, 0 +AVX_INSTR pcmpeqb, 0, 0, 1 +AVX_INSTR pcmpeqw, 0, 0, 1 +AVX_INSTR pcmpeqd, 0, 0, 1 +AVX_INSTR pcmpeqq, 0, 0, 1 +AVX_INSTR pcmpgtb, 0, 0, 0 +AVX_INSTR pcmpgtw, 0, 0, 0 +AVX_INSTR pcmpgtd, 0, 0, 0 +AVX_INSTR pcmpgtq, 0, 0, 0 +AVX_INSTR phaddw, 0, 0, 0 +AVX_INSTR phaddd, 0, 0, 0 +AVX_INSTR phaddsw, 0, 0, 0 +AVX_INSTR phsubw, 0, 0, 0 +AVX_INSTR phsubd, 0, 0, 0 +AVX_INSTR phsubsw, 0, 0, 0 +AVX_INSTR pmaddwd, 0, 0, 1 +AVX_INSTR pmaddubsw, 0, 0, 0 +AVX_INSTR pmaxsb, 0, 0, 1 +AVX_INSTR pmaxsw, 0, 0, 1 +AVX_INSTR pmaxsd, 0, 0, 1 +AVX_INSTR pmaxub, 0, 0, 1 +AVX_INSTR pmaxuw, 0, 0, 1 +AVX_INSTR pmaxud, 0, 0, 1 +AVX_INSTR pminsb, 0, 0, 1 +AVX_INSTR 
pminsw, 0, 0, 1 +AVX_INSTR pminsd, 0, 0, 1 +AVX_INSTR pminub, 0, 0, 1 +AVX_INSTR pminuw, 0, 0, 1 +AVX_INSTR pminud, 0, 0, 1 +AVX_INSTR pmovmskb, 0, 0, 0 +AVX_INSTR pmulhuw, 0, 0, 1 +AVX_INSTR pmulhrsw, 0, 0, 1 +AVX_INSTR pmulhw, 0, 0, 1 +AVX_INSTR pmullw, 0, 0, 1 +AVX_INSTR pmulld, 0, 0, 1 +AVX_INSTR pmuludq, 0, 0, 1 +AVX_INSTR pmuldq, 0, 0, 1 +AVX_INSTR por, 0, 0, 1 +AVX_INSTR psadbw, 0, 0, 1 +AVX_INSTR pshufb, 0, 0, 0 +AVX_INSTR pshufd, 0, 1, 0 +AVX_INSTR pshufhw, 0, 1, 0 +AVX_INSTR pshuflw, 0, 1, 0 +AVX_INSTR psignb, 0, 0, 0 +AVX_INSTR psignw, 0, 0, 0 +AVX_INSTR psignd, 0, 0, 0 +AVX_INSTR psllw, 0, 0, 0 +AVX_INSTR pslld, 0, 0, 0 +AVX_INSTR psllq, 0, 0, 0 +AVX_INSTR pslldq, 0, 0, 0 +AVX_INSTR psraw, 0, 0, 0 +AVX_INSTR psrad, 0, 0, 0 +AVX_INSTR psrlw, 0, 0, 0 +AVX_INSTR psrld, 0, 0, 0 +AVX_INSTR psrlq, 0, 0, 0 +AVX_INSTR psrldq, 0, 0, 0 +AVX_INSTR psubb, 0, 0, 0 +AVX_INSTR psubw, 0, 0, 0 +AVX_INSTR psubd, 0, 0, 0 +AVX_INSTR psubq, 0, 0, 0 +AVX_INSTR psubsb, 0, 0, 0 +AVX_INSTR psubsw, 0, 0, 0 +AVX_INSTR psubusb, 0, 0, 0 +AVX_INSTR psubusw, 0, 0, 0 +AVX_INSTR ptest, 0, 0, 0 +AVX_INSTR punpckhbw, 0, 0, 0 +AVX_INSTR punpckhwd, 0, 0, 0 +AVX_INSTR punpckhdq, 0, 0, 0 +AVX_INSTR punpckhqdq, 0, 0, 0 +AVX_INSTR punpcklbw, 0, 0, 0 +AVX_INSTR punpcklwd, 0, 0, 0 +AVX_INSTR punpckldq, 0, 0, 0 +AVX_INSTR punpcklqdq, 0, 0, 0 +AVX_INSTR pxor, 0, 0, 1 +AVX_INSTR shufps, 1, 1, 0 +AVX_INSTR subpd, 1, 0, 0 +AVX_INSTR subps, 1, 0, 0 +AVX_INSTR subsd, 1, 0, 0 +AVX_INSTR subss, 1, 0, 0 +AVX_INSTR unpckhpd, 1, 0, 0 +AVX_INSTR unpckhps, 1, 0, 0 +AVX_INSTR unpcklpd, 1, 0, 0 +AVX_INSTR unpcklps, 1, 0, 0 +AVX_INSTR xorpd, 1, 0, 1 +AVX_INSTR xorps, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 1, 0, 1 +AVX_INSTR pfsub, 1, 0, 0 +AVX_INSTR pfmul, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif +%assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %else + %6 %1, %2, %3 + %7 %1, %4 + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsdd, pmulld, paddd +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmadcswd, pmaddwd, paddd + +; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. +; This lets us use tzcnt without bumping the yasm version requirement yet. +%define tzcnt rep bsf diff --git a/TMessagesProj/libs/armeabi-v7a/libtmessages.so b/TMessagesProj/libs/armeabi-v7a/libtmessages.so index 4edaba1544e39b2a60ca64b19486f189e040aab3..07aee65fe6d34457ce7d9f225f5490588cda7ee3 100755 GIT binary patch delta 54143 zcmZ_14_s7L{{Mf@x&LPFAR-|V5+M>I5uvdz5fTwiXhcXXk`fV;u|&kWtdS9ru|~ui z3F~2vHP%>{HP%SULx##4>l#bO8f#|cvPKHSFib{9X7GJqc=u~QJsv+Fk5|w4>zq6H zo_qe?J9o~d=08U^e-T-`klyl2V0>&RokI*M1pavtLIH`EIV5qxtcn8($z4K-iunv! 
z3wgCS8G2>+@|#*qE#wDP#SmghhOG>xeo>t3vbQ>k{(=Rqt6NL`S>nnAn`s$ke(UP4 zuviwf>OizquCN7j>fSXy!CL=Q7iunK8U1xT&2P?wwOulbLK z1YrJt@E-rN5D7i(>Ml~^Qm*r`K_2YDs2}41)8Mng8}(}VMQ{Seu*3Q2Zv$V{Ul(oh z56nJeA)c808%Xx~#6sH9*BT$|uiI=1&xwLhgPYmg5CoNQ^q?7UA>V+Hg!~_h=QD?p z7MJ`G@F|cFjXCrAIPeMJp^!6v9r&pJI^FGlqXE{rjRp2XA&eHBDf?VPl3WTI1HSl7 zUH+V8Py0s}0_{Ds*}IV3{ZU=doWk-mur}2YOuM7s6QLk&S zA*BD)A3JR^csqDwtQq;+!25x(Lcbe)^QUz?J(9}_B>zzUd*E=g%WfV5-v|DO=6f1^ z#eju`;BK~qkNM0(qS3zrzQrZK8@w0f|FHdM;3py954XSU1&8h%sLP+1#fk^&dgg^Q zuYm(s=J`vk^3Qd?^LMfKa|aI1_ou8P@xWQ{aFtc29MG>1S6Ipib||8V;MK)J5Oq zU?lZGXxK^SnZ0VZXfu0T4*h!70sWS6SxO*t4%~k0K7|G!utnX$XhL1+?Qg<=&)$AZ zIqW%arV%{Jcp@R0E=QUKpW))kO6YDb-UPlr;m4ABg3kkQnyDR65&*u!#fO5ga`92% zbBbIN?tz3B@UzY3Nqzyo!Nso!U*fX;ufaR-g4r1l_%!&w_&Fp1PpATXR6?CC=7xnU ze%R|g(g>-IE_-bS-vm{%;^9YJQlDN0^MuRE{uX>9_#aM=gO79Ze+3`qvi$+@CYXCm zv2~Av_gYN z(DxJn8Tbsy55fGApZKrAS3rIo=1Z&pub;cRoRBWqpcpr}9yW0Gq2OHy;&Skj$rjQG z-N-0V0{9s4_2}>WiC+so4)SX;fBjENlN1pW-<2VepDPyFlPCn3KE z^WXZ3KLXyhul0a;ZDQ{;kWi31dwnv_XbgNyYTcoD|ML1LECgn?nG^L%HTtN9M4=A^ zUkknneFXS~f*-fP1AG$XCt&_c@RJ)XqySqw9eh1(pN9Fl;9d9kt5mWG?u_6AxIr0g zFa!n4g&Dvo=L_HyH`Z;w%Rk3!-Rv!l(Pg_Jxj*~IF8C?|N{knFXlE;J#WwdK^UP4F)dgF+<1#l|Sx1;`z6+36Jc zUKj5LXKi=!KLa1-a?tz0*Sh#e!Ixyso}{t=V(=B(VWtz7LlyrK5(-@AtiS!l9|m9HlHUuy#l?RLzSG6q!1uZMZ;gDoTQL54 zkT0i6#*bb%65Jr6HQ~VK#K$(*-8S^(bv1CqeJyNkd=)fUk8G%WYv_i$!J)kChA?Fq zX-7Ki?);+s&P2c2&t)RBk-5ksWHGV?S&FQvqZe){??l&yv?HBJzw2j@5`auXrXbUh z8OTgzHZl)cfGkFq%(9%6VnPM75?O_;LDnMckqyX3WD~L(*@A3EwjtY*9Wykp@R-nz zJb~;*_96R`XOIKPLF5o}7->g3krT*C%nPWepN@5F>I zWH<5zvKQHh>_?tK4j>1SL&#yI9qB|)ASeCHXRH1yCXgGjDUc?l8`2BuhYUalAw!U1 z$OvR4G69*0Oe)791(}A-KxQJdk-5k`WC5}WS&S?}mLe;VmB=b&c|8UV$VOxnvKiTe zY(;h;yOG04JJN|H0kbvgh73YR%(9#$VnP}+8<~sDLlz*5kj2OnWGS)&S&6Jd)*x$< z^)rO=*MJF)$R=bnvIW_SY(usqJCL2oE@U_I1hN;|hwL|KjQ=y3Fn}CH4k3q;cBB(I zft*BMMUsWsiIFCx8`1;m1)LfGKA7N#3_u1ULy%#}2xKHO3K@fpLna^-kx9rDWZFU; z{~4H&iOfdkBJ+?1$RcDhvIJR*tUy*GtB^IwT4epg^4Y3tz=TF*6S5iEf^0>$A={B1 z$WCMzvKx5<*^BH$_LpOD204HnL=GW`k#?jLIf0x+UPY2X>;p&>(hcc>Ece2|2kD0l zKn5X0kYUIOWF#^Q8H0>NCLj}$NywC0mXkD0$UtTyvyr*TJY)f~2w99QL6#ybkd??P zWDT-*hA{r>F`)t3h-^YOBU_NI$Tnm_T=UPau1deFly3-;W7rkORm;jKPIFgGm&}7Vq^uf2HAjYMz$e4ktdM-$U&qX zIk^bOzZ^7MQV*mbG6Wfkj6)_NGmyE+B4jDD3R#bALbf71f^htIV?rNt06C1DK$070 z3+jgSK?WfskTJ+aWEwIXS%54l$Dk5fgRDn3BAbz|$aZ8W@(gkac@=32#y)`bLHZ%f z1270eh9JX`5y&WH95Ml!h)hDJAk&Z;$V_B5GIy5cBo7k`kVVL1WC^kYS&6Jd)*x$< z^~eTfBeDtEjBJ@9jK5Y)XhXImJCL2oE@U_I1hN;|hwMk5K@K1XkweH~gU0x`V}cVo zft*BMMUus{Z7Cy7NH?Sh(hKQ>^g{+9gODM>neiWn2@%LhWE3(68HY?jCL)uNDabTr z1~L_(nI_9FX`{m3)O0puWZ2svDifgR~YP9P_dSCIsMhBUsi$w(8@4e5dOLi!;6 zkO9abWO)b%VaNz%Br*yagN#EaAQO>E$P{E6G6R{3%tq$U62?C!6d;R{rN|0oC9(!t zi>yaBARCcQ$Yx{knPA0WGAu<*^NAb>|KK6zYi1o zk!O$t$U)=~au{hxI*}8|N#s=|3B{&Gx*~F|q_%imX6ZBCC)!vn(gIm{5;wKsF+qkj=;z zWGk`_*^cZ$b|Slw-N+Nj-WkI9>%)Y8jQ=!D$UtTyvyr*TJY)f~ z2w99QL6#ybkd??PWPR9djWmSe_;197CS)_R1=)&hL$)J3ke$dbWH<5zvKQHh>_?sn z!|^|W34_QX$Pi>0G6ETij6%jB z3!K4kw4Vf>xJgaPCratJw$v?HC!3FIV_gk#$yO-MJS z2ht1aW6&7?ewYw|3_^w=!;lfkNMsZ;1{sG;KqexSkSWMCWCn0%{AXf9HZm7kfGk25 zBTJB_NUx={XYPlLM8+XAkh#cGWYtm}|4o?CitI-AA%~F@NVjFP2lPS4ATyAe$ZTW@ zvJ%;X>|TcB{{$xVB8QMpr0Le#19~CDkWt7KWG1p0S%GXoHX}QcC(1DxMB0&Z@G zkRixOWD+t1S%fS_)+3vc-N<1i`PuCL%iS;tLPj7Hk!i>RWC^ks*@$dMb|KFohmcp1 zrscB-4wz**3B!a0WC}75S&XbfHXz%OoydOVAaW8Z|9tjyG5{HYj6xkrpL`ETFkO{~%WF|5fS%9oU)*x$<4F-+z--rp#$Tnmr@&vLUIf%3)Cz0}s z*&6Xc`XNJ*k;ph?5^!exXJA4uG7njVEJao!YmoKGCS)tJ7Vd`UUjvIZ;lT#8QPKhh zWM5L!Ztz=1x(r@nq&Eh}Kl_f6Ge*KyMurT2&xq6D6eCv+PBRh!J7%zAI>8{7NTxw1 zkz#|AL@Er*5@|4~NTk`IDv>sWnnbR`E!zCe63K%bmidciQep6CGN}!R@$U_bwCW+j 
z8&+X97=$(WjRxnz@?L{JuzJnl0$31a5EgN_8oUV>i5Xl0&yWlj!kTu2Prn3!N(KymPRXFbAxee}enH8w!HbmG z4USOaGze8ZVQ`$1NrO<;R}FpzXLJitVnjB010yDbYZ-AfxQ>x>4kOzB+6hJBQXZc8HqDk!AOF^=NL&e zxP_4DT%ZiEP+mD@M(#(8+=A09R{l;(rK_-B3%ZzOQhT2XA(JK@SH??4VI6=djLi- zE|GqNPKlf`cu67y2EUTXpuq`=3>myEkzs?A60sXxAQPuS=+zSjZ;{EQ!DTYJYVcN> z$b&b3ArHRxh5O*kgC^&ZFDCACBjotY=ev7q&X-So)%nE_#_kdixx@o5alcF4>k@an z#GNj2yGz{a5;xDr=Rrv zVqD@#mpIHN4swb8Tw*Vm*v%!DUE-^k@VMpB1`|IRi0v-%kV`z^68F2ry)JRLOWf%a zx4XoxE^#x&~ccmw3n}9&m~KUE*GsxZ5S}bcx$t z;#QZqx!lDxy2SM^ag9q{=@OT^#KkUgflHk05@))^X&A3GCV?cEj0BfB#wCt)iNjpt zpdaG8&m4a9?sS7yK&0FQF!TT&?a(a`UggX#gt0@sQZrLxvy1N-5c(-HalyF zMTm*!d9ci3vE7G$&X|{*cxnM%M16NV(;e5;QfuO+>&UVt9Y>cf{P6NuMZ2A=9HQNi zHcPC;&esOeK(@io%L1q`%dm@`0dxap&7+(J(kQldRKx^QnKF5l$1kGQ?1)XATtu^| zm%D9!;S#I#oyi<$?VhZ!-R+cZJnlw%(4%E!iS;?_52?FHxEf5?GmD#eJec0km~~im zhEQ+n^(W_{@&3vB6lL^og<3~T>YN>dnw%N_h&7|nu6cJR2H zsTV|(V?5(#TF>^43CGQJA5(L->*PeW*d0L&CDv#Yo{=%*+sCOj#k$M6ZJdX# zrTZZ-a$KBWOSduz9b>%UAsWWcjPach(M;Ad#wQ=53u!<1&7#Ly>^gBi3tod|v1OJS}|N={ia`i%oCSBt>p;#*d5R zhv+{g^Cno_{u>KPo+EY~p*tz7v5RG0bOU13|_ zH(jSGux!0zy~ygNv1WF9gzx$f{f@2^aQJ6rx<+_^N*Bp=wTS*NT}4@roo^kaYv_IA z+#ub~nBB%VeomWNgH8Ah(d$&@UJFUTS1h;Dr~K%xVxP*k$#R`j9_Miuwg;j%hp<`L zYi1?LLh=`hZS&cAS>Eo<9~Z^G>{(6mvXBjLiOUOF6J=*?ykQZ`XS;1&4q~TRl#QPW zVyVpCCc1H@bXYLUra7$zhz9Faj*Kl;`AOL&Xs>gV=a9nPdw=)aoTtE%aQf3 zIQ4O{{3ezmvCJ`E8_t4R;u!A;XSvLKjQiih^4OqVRNcZFDD$%Oz@_X~g<44EUcPoI zD`AI5MgLNEAEV!iz({sNX5qua9>oqaxz>5gA@_Jxd=IjX5ljLM;h3mkCNg6h|Iq7H!Vo zb+W%pCwoD=yTs`N7AG?wyYTrHD>TV9&cShB{WLqRr{tK)p>~~Y%@+BaS+V5aKDNZF zTJN7A5jDnzs^c9}o$ez3oQanHe#8#OWZBFpa^LF}lcGvQAjjU0sc3$A~F0tni?qVw# z-6mq2*fW&ABo6(FU75pr?Bd{?>@sC1N5#Ik*fWe)h|qSXF{wl+JFF~Te$kmX(?Vrp z@*s<3%zapdzr)Tjxysq(;Cd%Zf~dqH)^@UPxU`IkfTQesMhkiD-&qtJ9_1dgRJG$6 z+X3TDG#q0q8O`VCyIC0XvvH4qurAh#`%gK}jxgpoEIi(4HD;sPcAa7$QTnLZ*v~R3 zJ2Jw1|HEQf(}>VdvpSWg@m&LKpS;z1bxfpu#+EReEH?j_Nh)0_%D!Oklx2;GZ5P

rgcd+zdSy6T3Xy;c#r48=SIWIX@TDR`+ke+io9K7%*X|Md8bBBZbg-QJog*wDU zm;|#X8xsk)ND&g-|CZRaOoI2hoqTJYRLTPFLcUWv&sdv{58ox#uzVZelpw8TF*bfG zLF!^>M@0SIQmLfw;3iVILwK*0f|*!bE6dNkY**d)jQ3^uA0f9Y8G!kBmLdTvyF-3XQjWpDSv}~{6$1pOEQ(8 zG3LMcm!vF+V5Vq(NjfC6Vmr^SmHL@(7t3FfiY1n86Rq{qvkHApgzl7fFnO~x-od*Y zr67pH9m3uy9j9#77;oPtH8S=IPkl{FV*Pf}^qS;US;Hx@eUFqbvyeVv+bcy_*p3n5 zcTh@_Snn~B|Be*LO2P2%YD@(M|7@;8xGf0K}2 zkbN2bv+#dWwkgKg&|jA0C@Zszq*^&wW=S@&zh1tHs;`?!*|jF|`S0Z~nUpxkBtCAC z=TkY~**eahe~=rcRc+e}WhDzl^*`kGbnXVIH(&Vk9}7JK^PSyC`PJj{GB~$aj*4aflszT4 zP;VnM1TuY(@RT0e8#2E>Eb@Eg9xb?FzD5puYNQHO2y%L8Boq_?IzCq;n?UE>#oALc zOh5X^VpYF z!g?EA+%n_4E-7q-GtnVi=UAE5V*RDZ{kBNrF$#bCWYj`8IHMiHV@S@@j$y2GjJtm=N3&gHV%693YZ6@{Y?JaRQ3N|KI?{l3aEjg7w}HU%oan%+>U5id~a^J3=`ogLw zMHGf98x$5&&LOB2YnLh2GV7=o7nUmm=2Spr(M!s#k6BCX`f=+zOQ&rjIvIJ ztaxhT@5LxvRCt57nwa>#E0lP67Rg^;p=_HE2ls_^A5b{Nr>8Y?Y??2NRSqnK@X(hU z*$2viI35%M(m|6`8nI1r^$z9TrO=<7z@7tE0CPd9pepcDKsS)-ZQ~RcJZ_wVpOas@ zLn&m|--(~-ubifA7kq_GP;x1=jqb22ErH8ngJ96EDvjiU94~038B`An0_iV^9vE(vW!m}4 zdz3Jb1iKzc7SR~hI@g-*jJ0z;5xPUKjmIV`Wvs@=k0dIwtiZ;n5*2^1OxtT?ZqM9l z_qGH(v}f-9BHNj2;}I*B2t8t}MlSzWBPEq0f2DGT>h;fSWD6*EhbX*P(Iw_t&NnA3 zm2gwD<9@{}y0FZs7X|7iBhTZlPYYN;!##(GO>tA-lpYx#{Qt|_a-?^VC*Q9;X&%$a zX4p&~|1na!LFjs{ zbt6B!4hCQ5uQagHJJ&+z0eKdvWavSS_yhYMS4l6Z4b%ZLZe&Pa(LI|x zejLGD9v5qWqa33wC7t^}rJQB94dTjE%DvS59*hrA)_XkpY31kiJ-+p6<%rt$0F1B) z#H!886xDa8Y9tv{wOXW>Dh1RpXM|cSFW%zkbCfQA@Zz=uFtk7+p!WUz?6b;Q_2Rqa zsQXcIs7$d_H~Z)kYlSt!n(2(Oi~SYKM^t$OD&SsTy+z4nN8S;`Ta@Kg&U7ww@W4uC zg6-)LKEG8~D@wCQRweMlYGsSxTQJLHIZq!py?yYTt-~b5=cp23%skNUhq>EJ%67K> zFmHNESwh=*&r3?V1wLET1}El>JIvE+lt0rpaivDl7~RL&%Sy4l&KW+=H@~c;yS)g7 zHCfrhEN9KTeBfo}AUs9l^|eX@iyr1@YoTv=+qwTMN~j;Pue!YY)~-vk6o1L%R~1&} z%Q-~)jyTsjFOBltSCl;1>)LVA_lg3~i`F?$IQZr5$`C{y@cgDud0lx`Bk6w?TkDlP zmG#(!bEk5c&i|7}IzWSU@b^!En?TF|#E&(?_K?)n#Pk2CET-G}_CG4?=WT}xzDXk! 
zufZe_@!8jex=Xp8UjGVI6{LlMQXq~6Id(xlsAZSPdQJI;&ikWAmc6Wz;YOI&Ksy`7 zhCjjZqnr8i*Ohd3(#E&Ft~gk_jh8hmr z=bn3%$LS_fwnvF(tnfy0s72|Lqfd`5$~*A&<|o~)3qJXC-iiZDp#2|omW;jNCy&0M zmFAH!+rLAsQ{OaBmt>QWr~72<>=uW{MEl>A`HVgxdfrsdYEq83#{VLe9kiIob||;f zVzIG9@t|^s^VB%s`VL&kGMv@p!s`%pOzrYKjSS7x_zUkU&qMk)hj8yytj5N}@UZN# zl68H@Se5nui&?Opqe3GCAo01fu541m zV@FhSc~~WTK}DcA(7;9bBfE&hsrc<<>dNG!j0ckc!I+37ineY|6{hoMesEf{H<@$s)_4 zye=s>!)51Fu3mx`TqU9}DbJhD%VAalo&K3P{+%+H(g1P#s$yp>dsuW&E00j)GeIBl zP^vymV|X=H*VAb3pz1w<6 zYor_02HFMM3MvC_1RcCaBYUpV#A03DBbDE%kuFdZXd9>mlnn9$^+4r0f?y^DW`R;c z4dAzeia^<*?a)&Lf$6}~r7)f$J_t&Jc*7!%#DPvnz&=2Epgn;wNI_+wUbtJ9TIOo} zes|TXEQGVTh3|A%U%L^)3{V0n1~d>3Cj~s`4>4#m0D66Z$eOE8DQe*%m27)Qa2&a)&43xm`oiOgEy$|6w-Re z`K18$8F-o{o?NKDMOFI|g|r+I!Hd*&ZYqr39uG~taEm%-R(G{1q^d=1zD@l((R@ippZ&-~vhB!U%^2CtwOB7T&9p?A$o6!JTHZ zvv8d6eMEh1-Ulj?pI6A*CsdLFYIsfr=BQIlPIh`Zc+jKjan=S810PkBSnwE+-k?^& z(^B5OLA?Zz*?7leYDz{t4chlh*c~Vd6jY{=!e?Qd$5e8u6u3bp zd!B)&1FZrjfLsrklt@!mXjl69@;$A77wRZB}0Qc}YA zQ*%~TP3YseEAbP23nykZk zSRS%jJ@J*GpZ{VKKs*f3A+6G?stlGdbMz~LznyFuU zR3TlU;x~C&nHtIh9_4$>)N=Ya5n8TZOO?YaX*tZT73x#wMwRq~dK$&?3N=~s4Z2q) z^^d?{TBVY}Y=zh!R>)aU`@MQB#(JHyHS|Jm+iCuNZ?B%^09>u6< z-Pq<)vF%S#4>M1;n$`6Ztr77pDl@AwkqYsRJyNW^0Okxx4E9w>Cd8qjB+wkU>1iV^@j8WU0IsEiwQ-@vzk!7^Kot~zNhZO-VbC&A=d?_EF3Y4FIiN_8 zTYC}z(4qcTIyfcsHRI|QWra$1#_@CGs=p*p$zpn3{eWs$2B6KZ%KW1*)hAf?ry~9m z%;oCFPZYA_TXAwiRVnNHSe*V^y`Ry$dGMr~%pxxHO_S>SC3maD6GWge9&gLmi{C7p zl2-;=pH3iOdP$am>2Jcnvjlh?9{yUeZ`E*F_kuGE0v#U`(ci15BsJ0j#dUB`s-4$+ z{-uyMP~*S&6xH^+@3G5d08|UA{x3hsv{+VU=a-qbjKLHgDrr~rX2|mZ`P+H8tWCkq zm?Wf-dHW0@pX19YkIwP^73bP#+th!?tPFPN9Yb#_RG zB~g6JO?yhY1?~nu=bPtf%US*i-#?LB0uYL*OTs@OfG= zD?TOQp6w3B?5mQ{b24#%N9^*}zA~$@^OoyX!2`6zlqG)1(*w18)CDSuSfKLP1GNX_ zc(@A@R{}K&hKtIl7ip_m)L(gOkhY$gR4#(F``EEwuHL9Ms{Lc}PWW!xa-$ZZ9)kGl zC_i_j7SDF~iojqkkgDm2Wpc1btXiyvOX0i09O{$_Jar|P-iPiBf-icc1eh}o*AL+S z6EbN9?Kr{rglZo#PnAD&lNQcomA`S5X4O669!6Hl*?Fq??j~)&w8R6xL+ppDo2!!A zWANE-nIyPF8v-xC2mNG@N=pAOld!+@)3<2l?l6|RAZZ(@XpYL)N5H;`%IhMuWEG0w zHb)f~BeV&gS|N+I z2|Y%G3E2G=k-9>AiqX$_&mEc{OS1FfJG69GX%}&EnwipZzUoe`#v{&|4qpys>!aIGO?5Y#t*_u1k2f@T`hbn)CTXQrky8@2}a!~K__0|hn~>J%^$)&1gPro zVt=9bl=miWM;!Q^kngIer~h-|`)M~wZ|zZtPbXXkK+W$fB^L-U|>KLFRI4Km4kOl;q*btvw3dz96@-|ZEBVT==&`ealv|YPPnqRGu-796XC1+LO#DF0(}ZWU_KiyGg#Ly$uEDR_SN3aJ*Qzmd6te&4BC7{}x7K4BT&-@F zN$hnZ{{wB0hxgtqQ0<;Des6$@{Cb&e@`Leb62a#*b&h+00@IT$lMEY7rXwPHR9h}H zxAOw-{kP7E&xK{ByH#i+ez? 
z?`E1-;HJiBsj$v5ZDTCa#!t^RZD&0rJm16g)qFo_(h-U50_A|(K9EQ_EVP+tl2o{d z5l>xfx=mp%=fr+L)0FI&H>!L|a@?Y)7cN@S^yRhiYwfH1Z+)KBEpx6N6}<~h9aP=5 z53WuFqI!{0I{5wk_>HDASzhY&8|R6krj@F$kX}U*Jk<2P!Y+^U9TBFztO-_sErsu0 za1rmF5~r4$t}x^G=aZ49+vn1+gq~p1RrwZUt&-0w(`O6@@8u*@h|JnXMZjv)N=pAL z@>5Md6t<~wh^-Hp7EtpiG6~rXv*goaZ<@(Q+1e4YBi%FsKk5(jnhaAaJ2lD&GfclR zHf;S3beoTP)`O;x+8?oG{mCIOy+|7w1|vA`qB8tyC@<9i=6#jvC?KKzg=mz9l) z2KdJb1eB>hh^X?Ye{DUrca zk-pXBK1Z*aA(^`g}U3coO&m(g6{dKM7|57r2ci8FjyL$vsR?O zZVI$8(+Jdu~;I@u94xFQfQrfV6Au@3{tB^F5M}Sy(=WL@kWVkgKc&}y!8&KIFK)B zC&&Rxh3y$^n**|eOwn)-3k7@6ROl9+VTK#E<*?#igSq{7i5!HJJ_Wq&3#|a!1o0MN z0ub$;q{I{vE_fJKqJ9;d;UjA z7Q{Pl5IsYtlalu_c!?kx6zDIJ4SrA#P=FubH*Ct8D?=Ow8(&@^!Kw!)ZWuAWqsMoH zD+L9Bs(mD~_F9Qt0Zo1ld-8@lx<(?$;LKA&M?g9#3(_ipQJ@J=;X7g4HP2mjgI)xu zJOrm!B#YyUd4Z-Zmx;byTyQg=X6)p*;>cX{F-6~anGt5p_n(OB`R2=J=07U7++g0T zvAx|Qe6jhMoBIX9Nd71A#s;Y4jF`IBe9FB%p^uSHc#FdK4Mv)WB;pS`2X9xLf%Fbg zE2t54&G7frnZT&8B(fj!(?M3y?tj6X7VuU^B1i}A03QM@1o{BoKr*Q2T}CE*;0+$o z1<+Z@@91H7klkQ*zt6}PkR4*LKf}8vkm&^~h6ePgk%;r3umjM3kjG2V>cCeQe?PtI zpJLlx=BqNZ4U3vY^ZQEqCouZDp{ZbM@IDTwTLs+%bn0W6i$EKX!GT_YcRgU6i1(nq z!G{@YfQA9z_jg9RL7kvBP#h@jD7>5kiUCD{LO`(Oi{#eB*&c?^c0z5z*|mVYxAaLQ z`w$})vC+6|Gb>w;8>V}W}>34ak&dFHM2e4`qn zJM4r6klQ+m#6V@A+zwq~yV$kae9S%e{O_Q+HH^5!DV}*4`p1jV%v<49?~zCxFaf0g z7D~AVK2*s_HIO_A{T1Sn=f$oU&EKdi>Y(Bw{Up3fl>NNX5{wLIz-vdqbV&1A!$?&r zR4KfAlnYErW#l061Sn>;I8|%5G4~WOdr}zj2d#ZZ^wgPkST=hx6KNj;5RTzkd4Knr_sE}yfy^>3VuIq)cI?u zu8r{j=l~^xxj56SPP&@1=66ALWgBcur zzUbLwK1B7Vd`2px;OO~cTZ?(Cmeuh~MwWwOKqZh75DA1MHRr)lgT590bIAJ{=vF9p zE*ufw_ZnUgdjuT>9Rf{&Oxs|ZT?Qrj9rOb5yP*-~T(Pmue2~uVfP~YJz)@k++_*pz zKi_ZuCG$LTGn@n{2VOPX0t&xb>~A;2TC?P}kdOyAnb2X>JmGlT>`m1Y*kR*?LKo)E zQgqBiQ2U@{`1r1L7%d@C!c2H^44SMG(wjlBOqke0p*cZ~5KqBgt&7ExL*@%CFab7> z1bNSc+4n~1ybv#k4R(Mcf}jD`isOgPJE`vlNGMqZM*$uJH3ULu1C<7f%}31_*_=$6 zv%n{0FjBTg6dp6%sHEzWSl4a-6=nWu;^aTf`BGr+15gF2j5t8O*D`Vpv>oD;&^m7L z`DLrc+V_nF40d{pM7NoMXJX-~9gduwDBvVK(t#WGC$J6f}`ZhkY!7F#_wt z{$svYxjP+h>>h*tKM0pW=&bueWpJ+&4hn;KG8Zn3pi^*jmItf=Ze0%-e9#d%!HvMR z>&1n$=A+WeG}sQ*3ab8+kt+|tOl4+d>Avr#W$4)3;JANk_-?vQg>DN20bXSw_7yP1 zQ{jdI&arqEoEzM$g{|O@bLMKc92S@S*L>2%iiY{BVe@_;?_sjUbabIreT83C{_up) zLU=J_zVjgb$~aoIe==0!^6`K!~@R^a{L zPftfjieulJpOok_vFv;EIHk9Uz-jYKjNUBvQT!3$0NK+SG%nZzeVgQdYJicON1S&m z*1LY${WysfX^0X=sMdlYuxHDc8$7`1%H_9OvjY9Gbuih zk}#7Jz#{R3lNnGJby)a6sJoll_FW=8M~|0S^-i(zQT?pU9Q7hOU)K~f+-@AJW#mXL z&w5f1V8`AS6;JAalW3<1|FsU^|J%9$Gx~Y91Ridc>Q`Y+(AbVL-J4a7!H++BF6$qI zAAR&lSoAI0%XFvApttOKUf=6qYFlW1-ulU87;$rMu#qJ`N8PMHe<$1jL93gy&?X8i zby#lR1-`=hQe)dSCa4$T7|f-`6Xb`3#d+ z-+#vkejznEJt6+XJMzwz)(oq4G9rxBY0jPFbmwi>Et55byy|1s;58r-)T1{`YC{$! 
z-4F4TALz%}ot=F9N&PM}%;Rxz6*!m4&z;ntq*)^QL!Di(Rb2yx@nm9+UFR}80X@N~ z=fDp>qa1<&QG7{133IQFCrs!W>=L}?GNDJo3nk*rgbu6qPABr%%X(;FW+EjfDCL74 z#=;=YId;^b$+;T+7e{&ZWqm928{wYc=*xWBNL#$e*;2pmQTav-yvQ~Evk>Uciu3ZY z$ofVH(~nDeG858O{Y8nLv-54!dIf8=|K7tA%C_2nA7kldebYjxmTNV- zSOl6Zz8X`11t0gtakGaysmyaIPhmdL-LThwt_Gc-zk> zz$!W0?Yz^&a!B8FNVk%Os&mUBQ8CXlOl8Tr&%q0>u>?ZY=n&hlv7BfALlf}l1{NtR z$=lDjJ)PiT^|g|^fp6NrliJAPz6rkF+w!sk*QDWI9(=7Odah|PC1IdwP^&lNPhM-; z&*n3+$j9<;Soj#kFJEU_18;(e`+Y6BlIq547x-nRAbWM=!GEu+uiNYmsF!Wpfz+XOQI-Gc*3`jg|*I z-NrOWt^Sy!YV)-~=Bysu5p22H-2^vcpkh#*iSgTmEm^t--2~LDG5&h6cN`W6 z!z>>%I6eNtEtX_&IGI`yn^|%9+BYlSAxF#RF)@0JoaaH%)KQjz4%p7nM zCvlRD5d#_$X+%Y(77a;AMCCSUsihH9qG(XjqQN>E^z2yd#g?{{1T!R(WMa^aFoDsR zzQJB~g8#Z0)zw4C+n|8MJ>+bG5PDW#RXVa}@ zSl@Z-dCO&S1Ih6D(b&}YcRsINrVrHM-JNnMf z{WS*?S3TME#KFEF;~S)DiK~ZxykW``ch~N+Ju4gTeQv{R)>(`8&vhR+KeBhdb+{w8 z>pa`YrH$J*-ElG;``Pe#gna@w*POIh&{A96OeuT%K^+ac1*WV$$=t zy&ow_Fh@n8Uk~kibZ@^Nex+X@c(-34+TE`YAKCU?zuvX0UvJvwgU9F_mHshL<^G5K z5ANP}IC&=O)3!A!srMe*JmpZgAH4X&AN#&OWlg!;7nlCXoysK-2dezmSlou8ab^3* zo*QF-eXM`nzI{Yb_3v1pbRcCm@v4_+)z+71Q=V04t^c(@|9igVnckXPe9B6P&vwN< z`^p2%J^X2IPb|TnFdP5Rn_`Jcw%Nq2%~SK*wJ*)u(h#_&dCA6DV*F1%OJi2YEwMN! zUe6@H{nm(H`-W}hgNM`)UABAh>fUa_sHhFPHR_2e_K@$Um?a5)?cO}GbOHJsT{q!; zhO4`Dh zYTXc<+~J8?>^H^UeSD8+WIWrlAr=?cKc2uwE`*)$%3G2*0{z~Y)zTTW+V6|SB`uBF z(dZ7yU4ckzJ2kRDDc^&J-zMl zgNJy;p^9RF*2Ups_q#Xzah=9rxtN4R@87Z`o|ruBf9UE5j`vD$6|EEoE!S} zx@gLCe%%#KuK5D;*jwXcDI5FsF|jlj=WslEOYE;Z`^QyOpB}Ns)b%5JSulA<)HomS zM@bF3I}#UjXd4>zj9|a6V2#=kX-B`*BO|&uitY5u$oO@wn9PIJv|jw7jAvN zU)Sni!3tP!+p}Bu1RF5y{kjWHv7@DRDA@Z-@=Wc{e%-U{9juQ(qEHST$80wW!W-SR zxC2%cgONc`<73v1aj}#ya;TTaR5WIC=N=EofNN+(cLh1pBjX;dG?p^4qhEtS+tWt$ z`XhTha;ZU~;MuAT4>O>xqT26lDL$j$c7@;8A(nt0?kr zG^|0-M8j00ntnYU4Rf@fP3t#mOG#_Mh~2hiH%1xr&r0lRTU-0}%vX|k{tkVN)0V~@ z4!^E;-x6~gb8o})VgJ-ZuKcrGl+|*{qDL5fxo!P=W)`YJoW1pKqC8hX-tcEV6mZ}f7|Ieh{0}Mmc-t0 z(S0sy^>ZvVWA807E+D1#ixVo=f2zop4y= zCY*6UHx^NP+P^ecOvX8yv%mUN%@y!-HWwwH9i{u8cU_TsjVnuYWw);99Iie2%i~+a zJv@A1_aFNFMI(CC8~)DSz*wLwHFGCoU3z8-h zlhnfqEw)c2*pc^Ho|@Q>!Wv6lKQg`?`RvbLv*gCu=wlun)i=fd^7yMziE%8GhquPl z+jhmL=XX|k<3Gl&+v7j9#QTRII{Z-MwnL8|yleksO;7lQ@5uUx?|h_9p68aBb-nMf za`A{Be0PWck=>7N44dcs+v5tBS@?}+&&Jqa9>t2^$dmrbZ$8j?|NiXB0U_XibQ{nR1W!B`;Tk`k_r-th;J(5>ByEi3j8}CCS;vi?g;*w~2l| z|46^y5&1pV@2banjP~n=yPnhSg2(BK$GPQ_XM#JeG0VD4zp`SHuBt_Qa2WGc#Cox} zx^*?v6LZ*cdCk2k=1j82lss&|XO$JV?YXT52d}_3MN{l(pvAFBw>RAqdl!2uMMJ;p zk9;{07qj1(_QXC^iL-9Qea5cmy8Z2ve!XW%+`7G<=NgYDo)l+~CVMRDCTzym zg&s6$ck)c=@eQ%)4tmi;xtN>!_E7QeKcE3z4Y%*+7+t=TV>B?XDbL>5$XkhfiQ}Pc`UDDsorfY)Or+>gbK!YDxP)tjR6=T3zigfT3j(Z{>NGp z;t~+2!W_>o^eobC>V}vl?uV%S0aQ)z#kx)N%5}+G2&;pY@svZhw0*kj=b7-;-SQIc zNOSA9+>3DlRqjsNTY_Iykho4+SrWh*h-&R=EGOl_O<2xK8ixPIEuk%Y2e9LB!&T+o z9SQq+B~cGJo=-|zvwLCEn*Ga?Jh4Q(c2n&A#}f}{?YR;|o|3k1BJtrhyK$!N=Q)+3#BX}Or z;$|KH#IRxiizv?cJ=Uvmj~B7$rp(&mFz~J;_S3lKdw(6fWyj)%+m_m&dq-Kg50_s5 ziap-l>3G}-b}xS3u>wyGsEm(h#=CJ%ap4rq^Do{PcjvWVo9B0Y;=i5@ zZl6eu|La?e_uDS@HlbhL3Hufv!ptthH6m@}g!9q5C6`ojH(>Q|=j5UWTy~eaf>jUa z_m8``^S0i-oeMmnQR6cSpGQ8Yo?YwrRPlf)&*uSQ$w}}Jyp?(&VUvBoZS~rLd!N7S zV2Nk({<5ac6VG_B-qie|Jux9*!OR;?aDQhB&bjUR6V7T|6*eaBcU;?1{dRUx)`LcF zXBZ7yjC!>s%hh{4->PD3(m!oN0`5%gj+MsQe{g+%l`Fshtrbn)gWHy@vqkV=JqS)|o9b8uY!-G^p&H2DPp)tXh!&+qafB*$;XTS+bWOwq1?IxXX5p{o17s z!HxFky!)01)}EYc!?m%yvXRflTTa$Y@F8~o zam&e;i9pNE&Mz*1XSO0_**|r1!KsTkF*W{^e;lX?J=*Y%Js+Ix|0J;Z7U%ue3(f?d zzQwuK`u<$tFSj`N+&X*4^S;9@bLNct{+5%ue|Z?|52PvHJ7&L^yi_nvrfi!;S){nkB!k4y2ArRtu* zmOGsUZL zTJh<5E%#f9M|nPjL7|&k`A?RUF0!AU(T-aKnixbN@^XThCXvDWZL&oa7>tNQkPXN; z^brvv*^uvm#}9X0%DLB7k%>u#s%D2>*UC5*E1!X0#!s 
z4<4jO)~Z7Yv?holvfVB0M`yGiIOP6(M(YLZeu4P5MB(a{@qKSHV?^|VY*@FUgQDy= zj6Wh8K*rY_`UTNv@TizJ*dnqAX0(?vQk9_4P{9}i_>f$P-_2+>(A9rH0yy+%7T6+E zqhv_<$xxhVCNX8fzkqFVqTsLKahvEOTW!KSI-?aLfAoE5G;|nbg9Z(KR8)>JzD0C^ z+Htg`Xo3Y;#Tb~1jFCw>$F(0I{$&(QqM%NY1x2XC@gjdp#utLq`n*T(iOG&}<$ z|0IdEWcwIvWUWgSgDq>%6ZIe*Q-yjG6|P+)dO>4IK~|V~iVTa)*%>W}3hK}Q82&Di z=lFMhAx+S0F1P!h^{N8?f@h0S;CVowF`_y+C<&eQVWnB ztksM_Nc4iEC+D^D8>B-iNQW|mVc}kj23#2s%9muiej6)TD5`Fkx&mzXputCec6>aq<@}e_SzzRtwn#MoNJjXfe;Q7Ua1Q`Hj>ekR7QS zfaY&Yu0VjRwTThX4PbjhKyaUuy^Ifnkz?_~KZJDnH^0O5Y*G4GnO+R?Tu2?2 z+5<+8C5o!QNnHU(j$J5n$ED5&(Xyk$XGTEHz;$IcV#q|`Q$Q&eN>YC{Lb zfT??_!wW?4@6x{!H0JsPa*e3?5Nu5qV<0;k1yR8&Q9Of$wkt&5NtuuXvgg&OrLF|I zUyEj?E&w_7zB6);tL9~V1=#Kih&szjqXGePL|j&>?I1f?Z0bU*c2aB44u~?BbSME? zfU;0(eSs_>^L%EEhzyV|Pc?OwsVk^4HA8MPPGl{n|BIp?q<^iUUlMs2NSy=n1WnaE zbPS1XkdB##4vQv(5z%bMpVgCQdT0sqwI>K=narmzg+>DgP2C4Zjyc7ESEl!ZT!dK{ zN}U06YP8~129}D_OJzba$csZC*!a(Rt?F_aUja5iS6m@=8OSN{UP&Gi1tdIlfiH1L4by&x~QQ>GrH#)vdmB45g@qV#d;R}4m8a*1wJcT(fVCFcpGbN^={ z(4G_1026=0h)hKlaXy^l` z26-*;gYC-$qW17fO>0lZ>Tf=29F@w8%-|M%tt7Tvm>G_WpuvcUknM{_>8oUlsOcaR zM9mMtW67fVH5s4TMXF-zM~t_K($}e7!u^wzTJ49p$o*C}s1!s6m^%a_$I?XC+hC+w z*nUr9yMe}b1KH5}Q5moN4SHcfM8;%%@igMmpxTqrE3mk%x^bfFVB~m;P;DR@U`y0B z7aEX$gRZ$jJx|k(o`=@yMnyr8Ezem`rijT+x~Ah3w~jlYu}z0T-WN<&>)IsZYrdfy zi&uF}=GUQ(H3H2MEPY(Y7lRynVQQbDkBV;1Oqc?B4RAjpE6Oo=M3k7i+F*-lGI&%( z4TeSjPFX=N$f*h&42dCw5uqCli|iViUSSXoF|`&k0xt^XNjkKMe1k8DCXfw^7&BIrx%{Zv;7&y1}4GJw|;*)EW$G!WKfr2iWH% z68a2=abF+{r~om|eWHftb47Zt66V`3vO3<#})b#K9B& zqs%`9;`)!v=`b^_5XFCj1GdG$sLW6n1HAz;^pVUk0P>{ln?d{<+*F>Twu<0s(k{d- z=@6-(ARQwhK<AhxH$Bf@qu<0HatFeEYzwuvzkPeh&-ARgJ2v{Q2XsR%G) zibO^wooo>@sLfMRgX4BiT=)BTWWd zg!>BVp9&WLeO}8lb;%W)bSy&z3+@5A>PP8->+{r=QhTq0<~`jtvY=+r@H2FaP_C6a z2V}zvK=wG#)LkIUx93Q|bdU`e+UHMcZEKT6^g3q5P=R!GeE}LhDJ1cX*GD2lUM~H@ zq7F2+C&5p!ll6Ets|LpRC_2Ea%- zPUIHIg0pU7#*ip6*d}^NEXwpkQWdqEL9C@75)CaV0%xDu)lDlBXDXkP0*+A-YgSa&_ z7!rLXdRke{d{>HQA^kc?G`QhusT)B~)s(4~Zz3KIbCVjjiI)NF%Sk{CyBTkCK?Od_RZ=`V77x1`LKo?rxdhP2%b|OyYe& z_8!tI>OiA_Q3Tk+VJ4uUNrNpScQ18F6q1;lsRrsJLVb?J{lCE$5iuANqo$tPC(9cH zBavew>j3koh`A;Z4>Xm>WP15A?ti}XX(2r#H!KsfK{h1M)H$Xu1FOd8wMwuW^npIG z7OVmNAm@4vWPaCiGAznSjBFFgDamPr#t7K7b~?0(A+P`iL=AmJG`_<4qr&~FOm70S zk)Hb+G#)Ur|BLa>q7zhVt)kb^M}_xCGF}*bL3m!56^20t`MW_II0*8UjF$gnX25GU zgCS9E>RNDY46j^F9Rf$8!=}#cX1d@|5>94-0ozn8**!Lb1T2n;d7>JXwU_xf7A&;jDTMR^xT_j#s=aP75Y@Y;i zu_`6QD@6~9{a*Wb5C=}=A0QU7?SoU=pKIepFUS$CoMV#M$}t!b`5+h1MM5!x4y#L2yFalvQ1{ROvO{ZNJPz^F;9ms;al1>|QSK*N9^&lHO zZR%#HtgsVgzNn$YB1@IJ7^HubsfR&a|8bQ|mI>t`x1Zew;%v?ZSx`CH`&V3a7>^$L z%=j)-M@?;8Ak(u9Mua48MyWB^WeZ8Wm~?{}+3ZE9vH!8C7~qgq7{sd=&=@j91cym>ES(IBN)qu^mzrxO zLv}ECl_n$VSILM;kaJY9ni<1*z>qrgGU_%_YA`GYOx>8t_!eRRoLL}f?0J+x&7_UWS$>6H38ID;_REHXw2Z9YSKxAP$}@r7kjv_W%ZQPiX2W$b1=B z$?|GJHmuoTNQ{x#HQsCxYoPpU>8FJe-~m*B4KrY_LC(GHTA87XL_y&k8Lxg`>Jrdc zBOoh|n%Z-n)TJQPeWnhB92xZsayQk3C|47OxpGbiK{j9#oQ&eL97E$?&`i(0{L>&^~}##dPY?(IlqoHKM-a!os=j)2BURZE@us7&{PybthckIRG} zkPVP*6Ae45kBDv(Q!om02#ac<(Zgnt_x)YeNFOqFlo}1Fds3$RK{jxdL_XJ3(m#Vl zyw<@$Yr60XI;f(UM1kcXD{Q7lfn5f1E454NMld`wuVp?hJ5UJHzt_~$AQzkdO%R`+ zH`Ri8R-7g=rPjDA?~C`O9x3zf%jWd=a6`gZz3^wx&O)Z1kpnx{{V@G zMom3trZ?3wzC}zM#CyeOq48>Gh-}sHO!jRAu$V^cL7aq{-=%g4+aA&;a`wvI(FJmm zRyUBiUohArik_3YiA2L10wCU)jDZ}HrbedY&WSwk7ESwQc{=zq_kYoW)7X!ALHIl! 
z@gl-ti|~=S%w{)%7`ifp@ghQ^K-)oCfgfbg^A3U71-8TVPZZgpXBcl*4Q&yFB=S`T z!Pdnh{Rp;S3%)v`Bfz0|y+8*%R+Bj4rohp6@VfjMG#U~l@h+z~M87tX`7(GsL3ESw zA8P?yRUukQ+#QEOlovFJ2N^Jk^1`p5)|$a?PzxcD`eWIXD$rQHB<3^>HXyzEm()%% zOrodCuVjNOK0vQx5Z@_zQ|btb23NkNF``Z6{8kni2APmO z$avg5f~jag&D)H>PBaaJEtd(~2-72yEh|NZsSQH+nA#xLP{*I>k2@jq_)<~y=hNEY zAJH)l0am1gTr9;=X!N`fReM7nK}eEqM&ZZqsRXDK-@X0?}Iqnb4k45 zs2pcJUdw?zc#0>W@m^6Yh|$3zDkn*s$oRWV=pwOy=RN>YVDS`)HDLP?#83ypOjJBd z&7=Dx63>ADlojY8JEl%c|9TQ5TQWo1MAgS+iY6*fA%LMNIxTxx1#ujgsn8eEAuK8ulX$sAVl|gv03P>>QIJz%_s9wxK#t5b$=mBCAhxS(DYZ>hE|Y#? zkmpObm+=l!eIbZX97N{|nNXU>goUDJ&8%@bo(5Uq*cBieQgWp%s19TUl&cVrsqmAS zOLsQ?7mD2LX7SC5S4F|~)LzlM4n&Wu)`QrtH5(YOin9D!-v8rdLV!clwF%_A-WyQ> zdsr~5l?>r^ImowTd4;oDF?0pUq3!~UpmYBN>6q(c5)Dq>3}Uej?V|L~SuOZ`T=zj1Y}+b_cnqY@{u+s+6J*ba z%b+ormD@mEtm;V~SRkjSiyCWa%3wqiBU^YE;;}Y{NK8$wb~ggpUfm?Vc2jtdocl1y z3Wp4aMe)7nkb`Vc#&$9y(n;RQR7hPwqGx?18kTn-sh2T@@~gxVqUI%J1>L1W5(V8(ZX zn4YyyB1mriNaweH^p{V%9<4=@ANxF+OXN?s@O3dqHx=4LWegi66S zrzpE~&KMzOtA+>^(6p5axafS1j(9V17l|8$yFtvAyMg7G0y z4{{NAQy*V}ZO3@L*tj1=d9DYjTSNni5wUAuN1*L$(L`d^4}+XL+rxCkBiAD!uHzAs zm){*APRv{q2hb$Qo;Oy@e3Kw6?0po(2$erA{XE|UTarW}i4LaMg7DKC7{JiG>qrM~ zk7a?KAdga4J?Rk2cbT3dT>d$%g%nW`r1pyPX3`JuQUkc*-uPy`80RQqcXaE*T!$pX|&_`0~ zfP?2^I;7wW2Ousgb0inrN-|y4tv;jW4q#3%r&h)ARUl5vD9AZ4$(FhpxuIo$kr@_>Xc36(ec`Rt zSX{TA(G+CN|1t`KWBu)Cw7EC&F!@!cJA{3USux0w$uxDRsUsk-A$fP4(FTyO0^}6; zf%NY&G~WMDBETNZ84(eK)UHxwz||}lWJ~MsWC1OrW~&_XysyauePuGfej6DPbs*<@ z6l6uiU@04Z57R@U9Ax=jV7pzzhd|q9hE9+b*zP0SMD_z_&p>7@1yk7*Fdg)n=~YI0 z82g%jM@4F-95MTY&{!kcpaTCAkQG-l9`*RXeny+t`scMC1Q^j>MTW)nBeLR2kQvk+ zas)ykFCuf*(%<$C>0b-duY(#>SNfRrtMHL{7JPzh0lTV({`fQ<45rD?s|C*OE5jdxpgK2MnshRYz?XS0UB4?5Qzr6_pztAi|yVg3oHsq#~{c8ZH=6Znv!3`iASZ~DR<2(%kZm;M8IN~HLZ6aNw{t$^HImiOqMBx$Yg(7@Zu7&hw z&MDsGzsz{tptQ<@70@_INt`2km|P%A+sQ>@?iHrLB#K_8#`AyC|6w3mWc+{$$M9WF z5?>@V*eu3K96$q|WSkg&jl{R+en`KFs3&pO2Mvvrv`aRm59DGTcwPDp8pQj7AItP~ z5F?_A8U&PI<4d7Gp<|mU`6-CYWoi#~SPYXG5%m|OTV(!94y6D&1s$f=!Jc2uYwE8V z9~5pf?gfzt#$nFujCeHIAdY6xqTu_tV`c(=FS1-DpmE~$vmm_XdQbvRwBcK|G9!`gi1&t>d#cIOGCxr{i59Gq^$akeUsOntIeA zzGkx!@pu#3V=!KHpHDxV2)jX)=UPN<7xoKec?!s>%k(fF*ZrktJsJW`$T1iZxg_3l z_mX%lw=I+DZi7cfImjNBg2(Xcw(dfy1;_^3FET3xIR)t}r1pZHzrg=ByF}_J$glBg z_SI$v@Yve(Mc?I8_kfXOpA)rLk&8sx)gZQ&0F6~ojh=K~gDJw04_?c3{P()gllbh& zV4Dbj0UC=pE0_KbF?ECVugasgX(F;t78n9K_9+v8X!@ zjf;{&Jm8ec{C3dDkct35OezAodWTJiL24A_-bUi}If?uJPE+IWZWt9bkx0+GTNa!H z8V8Q4OF@2^B@E(2X%gj3mZJgOuikqRftO6V_me3i{~?4e`+dBcSX6@uHx+T4pSI zRAwv&IZ}cei^%O`JcimJzQScNC~D34CNrL7`L&PJAFptB%6v^40yRB&@n}Tg^L&H& z7Cecy;Hr_1xg=I|-c#lfgGI)}uOP13)&1ODhp`fC`WM0``?b(Vu)L9Lwl?vI>rHQ4T|+?IOcm3;Mha-ch3 zO}G4S;$r+R=M!6RQ-8G=zgt>S>^)A!P5ZSIzXsf?WWowz)vz8|d7o3Mgw?>BV9l@& zSgPVs+-u4$g9y9dbSlNL3Yh&jPNfLe2uoF5%DtDiS|SKX-f}8aFxPKkzjG=jzjJO^ zf(T^%H!6kA{mH4={_IpduuNDktO!;C^TC8qRq}SK%G{Hx(jio(;+v}C`If5m)Tv75 zw^d~fR{9-P3BjT;r5@?9^!mV>NpAPj$v)JSjeCM!L# zX;}J&$%_Abs#1p;DBhzg8GBVF(h%4)sjf~ac@E|85Bvn-^BRK;t46sP-oW8W^}_Q} zzT$g9CgdWF$^*Ya!roMiZAHaVneg`o@ya&XA7Jmo_){6ha7Rx(^mp}2ix&2(N*9bj zQE(3Cy--!MU}_9=1y;iZ%n$2;^}(x*`L4s$+>sID~kfdAE@f3g-+x~u2Rrq@zU80u_1-M`ySSlVUanURWwD9hL#h z)bLLhEE|>&D})up%3zhSDp(Dy4(5k7!J1(aSSPF-)(abe4Z)(YG1xS04rWV3#jsRZ zCM*Y*2P=RT!L$+b-=n{J+MC5AS?=-giXWdV9KR1^QR7r(ZWrF z9~a`W-5F9+5$5s2pG^4JOrI>%$A$P@gyj~;eOr;4U`Ij)!e=*(4`Ccd#!sE;BcK`) zPBIG(A?z^2U8WCQECXl5NUL+g#t^TX{&NT?8 zlW&IcccGM`v;Ih^L^#DPR3Lnw8LmgzWrh_rVu2Yh$EYkc!&)_jaYCZd#~X5JCI}GFs_6Cq4)99y7l8Y&nWz9YpwS#nH3zc?$k)i_u|>zhYDD8E1QM zRnieY`;(7CmzW~k^vQ!RKzOMcKZ>3$Gs7i__nHk=lT_tGGhBxFvJ*jQ+Gi{e2+jBY|4*=(joKrX;8Q}>v<59H6FNkasF zIioJKHeDi5f1}leEz$#%? 
zuxeNhtPa+Jf$@V^;4g(<54#D*pXST30WZUj)DiGxVhh7cVKuNOSPyIzHV1Qm4%;18 z0c(VHz=mM1H8@*fWiSB?!{%U~%W=NJ%3yV{2y6&83A1G(9hM0z&ccjTB7mGrhFbF2+N06!u+rh?EXIdGYy-A_qg|kRl>O#W=EwetQQ5? z;WGwy!lEc>@EzoXd11{kF^pT<5mXGiK^+``Ra3)ikgpJy2WuR3YKjkm0c2EQHdyV8 zSnk-92g5ey$Z?zU+${;pld#@X80U5973{f#@yep#Ih1GKawxa{#-ZdZ$;z#V9Ll3# zcPL9n9m>1!JCvV|JCt>AJ22FWA}&oV5#p}wGl?Fz_1gs8L1q;C% zVLsT@hGb=6eX^2+aOd^ON)s4_`C&yUqyUx&%YpGH`!%N$uE8zb9T*ju`;HS&ol*B$ z9s2uDO+-gZa-{*ga@=6&{1Jc=D$6?l(1 zqGs`_35Xms#0smvDsWo@64lkRb@fd2m?A=vy*ltZJmQL0%ZdUzV&h8ADsLiG>EDrS9lU1#I^}nuNRwdb9`?k~{-gow{N>OxNf6o3`m3VvB zAOEM%)HyzJ{~tc{^A?VK;BZoMe4sQ5of}v!mxQ1gHpB|+s2SPI$VpR7Qs}GYth*c# zw#x#`9q6z7G8tLI$c)PZ_cAi)vcR{=!pj0};EHlwEzkb&vR_0YZ$_2qGMxFB25xpH z-p%5be{~lw%go$kkoS4=rw^L97y}@S9t1g{d>|8e2)sb6_u@JX zYliV>Y|5$JS}!l`A%xHV=pX$nJataI&CEwX^_+0uIX>=l;=SjD)4jN9IQx@%PQutZ zm1Un3pL__ERU^;7*-r zOHTf_)mm8>xXzWlJgN9GyfA^41h%=7FG}L6NUIAx3we1C4oX^WfqDCfPM+P~UqP6$ zobm!(f(o3;f#185S0vyvTJ1>=%p+_^xW<#LxB^EOB%dG1U67oda9OfazbrYhc>!Dz z_9Ofl!jsWsbtz0YVf{GNJ}@1&1Qw%{e5i|Ix`y>@P}eyvZwJ(; zzqSxRoW2+8OnAM7b(e)KC;3k6bqZ!^hXok>$9RJos9T^m_N!4ZfDPz|1PMii=&RIS?l%s=BQB%<0VjXmxeMkJQtz~q z5m0ycS7yzLcaMQhgyVf_As<1rPhVx%oSdSt6BcNiZO(#T6$5n*)-OWc?^M@7o%hYJ z)8B*I1>WuQ{D+}VciQ!RP$xO<`5&RqcB)buJAZt+Da(0qCQ z?}oQah5B;ee-!EOU5;4C{BGCQx6V-U)TKQ~fQ}2~hjs z{O3L3t%LqknKdtsbq`c_%?oAO1N$cD`ARIKxYB$64(9UxzSjA^lpTrNcfvDNW!*{p z^lL&D7S>R?%zux&PojmS!@}HQ_`Puhbol%BP5Mt$*3i3eGN6#Ls)zP%x}k{8yKW9{ zgFAU7jF2VQ?bCxpDRXhPu?L4u^V8u2aL^(9i(&?BQ}J4?$hyR6h=NzSI0qL+w~F zhroKm;dLj^LEYm&hxp+J6+;~!P&piN?RD2&Ug|tj3$3+IOKpU@4tB}ys~>Y}eQpID zYNw5T8R}T5FK--&I?Ad32h`zC^Y1fWpJX8gcytdzo%&Ga@a;Pu%z%r`Wcd;q#vi}L0#_DkqV)9?pUQzcfxezKck~nL){H^ z6Pz@R`mNv8jR|lP>VX*=afT0I2Ism1>VgL>m(JkE0(yQ^e+zXTOb^009QsZDGt@_5 zdK6BV9{9iY+y{yXF~LB~!x^rD8Jz22sGSGmQm6}H{#xip#tOwiT>^C#*7yCUUIle2 zOs~M{kN>95ErJFw=nI#(pzt^KYf$^abU$1{{+s%3sH0$dIZl87H}yfNo%>oB)Xs<4 zvkw}YlV`6_#ukl2?MSX{jrJ|-f679Pv!2oHm1y))3kk>CAL>4+bFmJCx;*#S`R{4^n0L=T4y2Ic%+k|?uYr4aC!#R&dYl$kt`swuwL0XLm|u%vcWi@J+NK`wGCD<0P7VG!8memcnoH6Tl?z_ z8-7#22=x-D>2Lm~7Eq@+O+WgZ`g^DgoTmR$^qU5AGW5r+Uq`+V)ZI{r;2Pfo^#s(H z`}2cPhe7`|&f&%$$$~oEseTb^=l)v;b>ow;2eCgFRsN=-4(g`$vmelm1vdPqZi2c6 zrdQw_w*01Uhq@i6m*VtKe^Y<)f9oRhH8jM+hzi3O3`3paG$JQ|Q|k}I<-uwCRZ!PC z)j?179O`hGez{Yw`%V2c)KM@!17Bb8 zo4VvTbve}aF#n}aSOmNHO=xI_hReP5-QUy)pg!U>y&GzWQ~eFpGMr*AzkV2MFQ@vy zP`AOS1>-+A^5YCi`L))K1{Y{(4A{3R_79ti;IsCXFkb>{d=@wA6f_l0M>Eh&)cCA! 
z%%6+qp~hcRMtcETjFzLuU!BJEqAE;k&|0((tw)>CX0!!uL)+0#^a$FG_Mp9J->gNX z9~%bH)94^-LmlWiI)P51WNLPcWYmPZpl+xK>UBvtANgQ|9~yuLp&@82nu?~Q8E7t= zhvuUNXfaxfmK(zRlL~BTL)%ds>Og%i&TgR}8iyvJNoWe1il(EPXf~P$78#u&9~%nL zVzdM;Ma$6&vUqGn85&^ELk?L?2D-DnTmi}s=Y=m2^e9Yk%YW7Z-vjtvv& z6iR0BP@pE%1@%CEP(L&P4MIcEFmwqTb4eI~vDgrYCZI`Z3YvnvLe7d1yXb zfEJ@AhQ|0W#fB=h2CYTw(0a51ZA4qpPIM5pp$?RgIhRJ83mO1k8vkL~5Q`?E>1YO; ziDsj@Xdar67NEsw30jJlqZMS%?D((3h8naMtwZb42DA}vLYvVRv<+=XJJBO(H`+sr zW_L+1HuRzW=m2^e9Yk%Y106>v&?%JA+2zZq33Wl;iZJm&y-*+24-G(r&=52XU4n+A z5oi<|gT|t9Xi)+tNoWe1il(C(XeOGC=AwCMK3aelqa|o5T0UzL{K-4Ji>lBXv=*&H z>(K_Z5p6=7(H67~ZAUxNBWU*}Vf^)ALoeEg_M-#nX><^^p$>E$oj|8h!mtmZCe+2y z82@hA;DLIfKBylWfCix9MoZ9Aw4CAiufT>Xv<9t3>(F|%0c}K^&}Os+ZA079PV@-c&2aqpU_&q3hxVfb z=xKBiwV@7l9GyU?P?I#<2Ru+eG^7ZVa5M%@KvU67G!HFC%h4LN9&JL~&?9IsI)E10 zFquGQd3H_RP#-i1U4llTacByff##wGXenBS*3DW(8nK}T?L>RfesmBWM~O1KqAsWx z8i0nO5oj!$gr;8-#$Prz(I&JRZ9&`6cC-^cf_9@lXfN7__M-#nX%)x+AU4=g2Re>Upi?N(W{;(e znot+i4fR01P#@F}4Jg7S2n|8Q&?RU%8i7WkF=#9rhbEv&XbPH&rlUm}m}H{aXfB$E z=A#8@FT7}l2wP@X}MWh}Z8qh|x32jDO&^ELk?L?2D-DnTmi}s=Y=)fgm z{GG;zLDYsi&~bDEokEET`v7V}T~IgF1NB0EP(MRs{0Cq|5E_Dpp-a$kGy;u6W6)SM z4oyIl&=fQkO$TSkKQ?5d*=QbGfEJ@AXgOMeR-rX$En0`xqYY@I3CDjEHZ-FxXdBv& zcA`hnZnOvOMf=cxbO1e#I?!Uu)z)WK)p~O)DI0n zgU}E(3|)eTqY-En8iU3f8sk3>8xqhYGzCpX)6onx6U|0*(L6LCEkKLW60{V&H2%x6 zp#rT!YtUM>4y{KU&_=WgZAM$rHnbh>M33k={=2cE2kk}s(0+6PJ&g{cHq?QRqZ8;9 zO5i>#;|ot2HCb@{yI_MG>VbNpKBylWfCixeW2lYb(&>%Df4MUfp;b;UJ zg~p(~OM+?wmv;-|h%h3w(()h2!h8naMtwZb4 z2DA}vLYvVRv<+=XJJBO(H`+4?$A2$2^r8Le0D2l7L~W=89Y-fn;)=%}HK8u38|vYT zDO!$Jpp9q~+Kl#~y=WggScHiKokCr%nq6ZL)DI0q z!_gQt70pD8(Q>pNZ9`D2B9IBgkBMb4KZk}Vdb|SPn%1yrL^*oA75*3 zz^bwG#gErlx^?E?8Rj#)YD>^?Gy;u5lh9N&1I@1dwDa1c8mwy3db9~`Lyw@n=m2U% zCr}w~Juy1F8|s4wp-a%HS&K*5bO(Ik9@nHTYiDbgXa{di6DK;$nolGhq@q|0Hstn;4 z%o;Fg#7kX~Q2V88kdgiOq0` z5{Dt|;&H<uVut!xQY>%xo`&de1s7<@Q(_2xr_>PoPI-}$KSFE|$rZ;Vm+e2X6X7 z9(d~q*MZjuOpb#;j2lnU9Dcp%R7Y3&sqvrMet_aqZga{{JLUaOd9PF6?UZ*qy&pp<(*D>n^WHG zlsEpG7oDpAwK!Gllvg?BvZ~Z{X>jOVb&kRra?bFBHRkqBw&-drWl+}&wOFUmrS>N!!fnQ!V zndiF@*MI6{bNdGyXb)qJABh-=!V^#OsK}7%Ul~=!v`T%bYwPgbw~DQB8wNc99n$c=jVUQecO>UD4Km} zy~Jwme3c*dXJvL?=tsR-fn99(qif+At5MGUX*k#-pyM``x-n7Fu7lUT!@+!qq_fI2*Bub%FGL#4DytlXcap)ChRxSJxpz_)`{QiVrLl5kyyJ;xG$lJGOHdIYr|=^EdR|BJGlRL z>P=be825>!R#rX6Ga_l3Tf*4?PL!roSmE5vI>(AJF&#-mCAN1|ghtaYs+Mli$=c_| z^m2MXRgPFl>=B-L7k!=nOPsokmaB4|qhgS6h@&B{b&mW&ewO%sc6z5{;-F}Zqd}CF z(GnBeJ|a?+=>wFuiaih0qf~y&VH)PeD`^~L14F!LB~5U7%dvlmS`(~qIra|m zkX3XqOiLLOr&iG|3{v+&p8Y5dVN+v#`=d0Kb&v6hN9lF6nR}oF>IF$y^z_RKz z!5*WlskGZdc-R^mz)RQAIdr$EUPF&curs_{Mdn)itw#UGtvNJ?nGW*u99TmA0g<$x z{$;MR%|fipg}sml%4|cV@O+7$QPor7I$Er7jZwRWzQliV?A) znp#x)3@@yuuh3kszDYS&#BsdG}ki3A&mntfT8JN)9~Ck|XTz&=E>kixcnC zZ>aJpJjVE_Xnl`<#Aq7dew_M>s=f3DYKBKi&aH%pZdMBKX8H)FtHj0+Xq+Pd)v6_MbeKNtL$45fRJK)?s~kZ?Jj%j$LDoAc zhAr$Zvm#qamL#^$XQyPj(y?_&m4LYyFh;& zdVHHBbVw|{k)=qic#KzsvOtzS##=&J23t19eQ#!&Odb=ZH?tbb!tLCD5xYhCk52mc z@KuXgK07%odKa<#82v{0FJVVymN+bI;cP#X|LizFD0bb(7EpRjbVaay6Kih$pP0KC zrMvjNbI@y!SlMn_u5h>xiH-NNvy6Vi)8bhw%Ni4H@vK?$t{5d3w6*s1&(4xV2_Nai z6+(xYY87RPY#pT^it`V$&lMOzhgY&l6FWT0)itb&Z5kD|Ygi_Qog~x@CMbPR9DIUV zv^js%$=*7h?1tD`Cyr;cD49jrh1XLo$0YyBArJGi=h$(*VXc|8R_WwqhRE8)@+8;3 zv4vLEdjB{HD|Z|ltxAe zW-_%}w7$r0layEC#8D3eLWl=WNi4 z55)PW;NU;Sm({YX=~FzpmaV@ETHGKA1Y2QRuqnnp)6?PA)|tys*0NgpHHSVZ3g2Xh z7|jupb?kXc*NN7@vB^2iZWsICW9KP5H!AkL&z@&AT?98XjY%mwskX9c=~YL$ade8r z#D2DfvBkq8^h4IiSQ4-%EsM3VeNEacJT>^*+IslhK1XwtlVrIu^q?Qmz3TmHuSO- z%1({&?ypz`>l_jKaaO6)C45Ie+atf?@Ea5f-?D{_hKfz!F-fHhMd1(3m9otvV(S_9 zkwyc>3OkFJ*tQ?}_8-|s+Rk5i6Z%`#Io2;TH#jH#!j|jQl^>KOZ_z`gQA%BmBEh&! 
zzy>VV*{(4j<0`FX8^%PVtMq3^%_4H11m`UGQIUOx)L@ohazqRYuWKYbWieyIbgg98 zlo2yodQf-+3C@>+V8k4xK3xtJ0*mT$DX#Qo2V>XGxMm>jcY{ zl1!|2Sfu1j2~>W;ac)R7J}3Q`>a}Ll{T~x)|Bp$$_q_B^W18!r$a_Khr;G9@SjH=2 zS(zkLxxhH{MZYGcK?diD`q!jZnbp{NdWF=>LhWMdpQSvBZLo>PD(MA<7Kz~PQZni6vx`)K%rxG);2RZx`-g zODP`mM&l}*Gb%kzgT<*)X;NjYY~tj&RIAF*IP@V=ctHxKuJNl(WGO_*%*@RGWZpUj z2l*L?$Dqi(DBZ!KWfC9FNL#4UIC;TD`XG8QnE3rv{+CP2D79v;3ChZI zB#m-^CI_;pQL%!_OBoBWiOsTX{k{B@#kS`F6>V;`-S56H^JlBznYLmRQoiDA-;o!yzaHY--;uwkkMY7r`D$Q$qikoVzT{=^!e*U1#>@7| zhuN_&MB;lgoDWHlSn<9bs;EK#)`u?(q8%E?+Mmjwxdd|raj17!&*PrEOtYb{9_*s5SqSuJw33-Gm z^Gu|BP@K3RM>5uNhI>!T+gRuszGqr4WqS{Z@ELj1oh7yLCGN^S?7YT{+!cTJv&P?c zR{{3&lBRzQe^{`GaNx5iqvo= z$Vj)fh|*h?yV#s#+W$Kx9&g|$Zd0zM|KyXmDL3B!B`op-ulL!#&&NFSWX;{C1LV+g zr?xk+t;4D98*JP2C0`SvJfgy+=>KT^;|OK5I-rrV0gc~#yAsX7*Kb$0&WAVmhITKA zP{@yer;(2D_~J-q-*u3-exZ>)5GjyHLxe%-5ECagGJKM&cPJk%>d{CYlqbPrFashH zq7>?I&;`Qufw2i!uQN8mr-z@rL&;%*nPN?p5=~tP)@r01q5-0ME#G^mvPsW|JPE>k zEnga~Y;xbUQzc0d`=8K=6@)$75v?4jtONddj8QTubG27nl?H!Lm_daod>r~T#L+bx zDS^m?kRgU26J0ReC@Z$}qjxJIZkcx7pDdses&%e4-Lb~b^;qZ*4jYe*RSH>?jUS9v zB3XrvPsb|09>un|MqQr2)8=Ujv}@1b`$M{8qm75%qlD>hkHOAMhds4gWZk1oQavwC zBWob4w~L&66R=7&Jvq|59*p zu>JpU+UA4Z?L7W|Wxe?`I48ht@@M?;{mLD*o2v=REmyre+n18Ua_WfFEEC+GqU3n2b~F#UTzbV) z@7ehqhj{Kv1^!GI)hnUvvGV))iAP}YMZ|HIsvKJMz6KvN$)%5(Grx2m8gDXcignwe zf&WuCAL?xq=Tenuv!1*hdQP-P4n~W}waRA3_TC{nGL>oOwc-xwClFb;Lx+TLGoqPh z^0e0!A2Gg8iJarR z=ak>mxA~Uml!IzTs7AJiiWQrbX{v9!2}U7ACR`Q@lx*s=WrSKw&fe_fb%@@u^6b`1 z7+MfP5Y5~8i5HX;s^uU#4W{hY#Ag$ zUWXJv(U%e2YCPM%6|C%h*wo8F)VGEpQwPovCPhW|EvW2 z1lU)cf8dt(bFvhD&h4pUtMcOnbIJg<@DSH+epB~xW~n{aGb4$%28YNQ2X;At3^;6{k0FY=CBm>-(zih0(X z$_;b_-}a{R_`D4o@hFCqd=cy#$WIgrb%$~rz2+(SOag5o5DAb+Lf8voIz&UENPA29 zh0c2cKB=tN$Y6m+jzDZL5NrPi!;h}wOW#(K**P2E`nF5AiJ@!i8+5W6zNAXoZfc9k;=-u&Ml& zkCd07eea-fZBwkq%r5XX*#RZ(n*Om;>-}dVVLm&Y7W*OO2jQ$XqLS)iReX6s*{s7m zB_DxPSrS|=zEj~}fz#SuO%*r$qxQY&}uQXZ#w@eN(dgTUcS z2=9L@H4<|f;mt>t2k4zb{am@gXe4LHlya98hwqSP9fCjZ>uwj@k13~^oC1Hk^5U%fdQ>=$D-Nc&Ti`kavBx5Yzc#My+q&U`bEi1iuaqfj_Fl_9b!iTqJB;&39zq#|64F zL=4113*RrR8<^kQJXldX^oj2javCDxTX9}d@1SNg3=4SEot`36Q(tGROt2CV!4Mu0 z4h=d5L>ELoL@7i*L=r>B~m1`FoFc!qpGL>%O6 zC5=Qu9Cw3dKx9JfVlYS{3L(13R3bHusr-Id)v8d~in)BdtNPaUkfuPyKtw?FyTV3+ zC#O}?4>3XD-=QHQZLT`4sPTVMNlukmGhc;o_sP6J+!MPM5(*Ip z;SI5*n|EEM?qK78;#uD6UN+FheXmvvS&fbFxLQ4H9#=^;L`av|>Z5v67BS3sT?0$* z_!nR1tLDfLI$Ge*YF`z;lH52f2Ch|IDYV&#_&GoIc{ZYo_1CHIQ#Iye_-Fq<7J&=Y zM_g0~^hSp&Ub$HvHLK?v6*AB$Hr=ZJp6R_R*>FlFv8Po2(QWE}`UT$*p&nMZ|3e`a z|KO3gtEbf328CD~L|~-q%iJGu3=iIEyVA1%Y|oHPA8?!=6vB0E}bfWB$x zo_DF+)KzO$vMNLDxl28zu)V2b+r4TJ^GY5u{;GJ;N+YhGUX1@$g)_KghM!9 z7XEA0X(r$A2p{ADPpF4kFMJR9gc`?IjPYe_)l&GLly|OG&%u{$yyXvSLP~3iLUuz$ zLIgrgzod|D5L;v5v;z?WQLtGdhhKz=y(J5@4W1cwe{Cqym8Tch)zH`%o}93dG@JHD27$Wn+SYZTH2(FalX81LAkW_dS0rjTP0y%7GX3aJM3AqpSk)>qV}Z2M!P@D(+X zxqCpP3URfVo_Tiv^h{YA-~Xz*N)1`9kl@wavrOHppGkub4bhgy_mruV?8KvD_iL&* zbIoONV^nPY8|;Tm-)z;Zk4rRLL^r6+ ztX739WJ5S#@xHo_-YfRM4?V@R?e_|aTLB#wqU>(CuHFJCyjysdP&cU2ixuL!n4c2r z3bkVqTyz%k#Vu+p%UQ(jEpQ^N4dXjMgcF@#m{{7X+L<1BBfP;)aPql{haOO^EbJz} z@qk*U9tnXJ4H4==)k;~_jUxV#8mjn`V1+a-oQ=dF*h>)U5W|7ce**dTPIae#H~`)j z>c&7}{Y0Hq19w7E1F;__j6+m=K}Wj|PB?zhvmsVprI2Jlv9??7k|I;Dg>wcp7Wyir z8S>2#^$>GhW@e1Gx@#126k;djJrF4nC!s#=qmaaFcuo)W7IuxW_rRGzcX-1k;cA7X zdGp~f)s1Ycw^-Aw##6TDD!%tCwT#7IB_fZjN8lT#QEuv26Wq_tg?)J?^kWwo1Riiv zxk6<1t2;Ed)P?J()m?g`1y&d$$Rb)#t9I(1*a1D#tdI_f?GWi^-g-ves8&O6HH)QZ z)nTdzFxVe+qQDpk2GOc0q#Pn2Vn9~l03=6y{ zj*hD;{HnKC9RFFppV8}i;Dj2_N+&-{Wt6E{6Q* zAzA!pehIx`@pC&6H7VGC)lgZ_hAk_A@EsD%E~>{Qwfvk+TF-HJs-4nz?}BS3M8PgT zO|{*w(+-&|`B5f=;HmHVex^mT0S7wa5EI2cF~?y 
z=E8aJEZ;OoTgtYM@V#@ic((K`XRcbM*;OSiXJoSb4Bz3ZyAYZ83L zsqmQv+5^n1geL}QkF!CAivaCD=KTd%uh(kTiLq!0{B7EBy%wgPgxt^0PhPJ@v&kOe zAE^0LbbSy0c#GJPfb>R3Wy$g?39W1W4L>*AI1bkLx)TnA(A0fh#-hih~57bY^in&^E?it@q zNW5KpmeB*e>kiF_<=Xk+9a=JLu#2cD%}nX%e8rtwx!awNP4LH|Z2bnliHdS;80BZ~ z)HZtD=~y}xWxdCmkvZS`?U20SPWaK8=SFJ}(4+80p$2~F6C2{(sa~ z%eU6UX}AJL2Sg1-2gJG8;fw-N0?}FyLj*$o6C5arZpd9hzjEICq&8-L5Bds3>EFcO z9PL^2O>e?=0A?A0Ywf=83As=>%_KAT8rZ9HWqF*?{)dJeqDqEZt_VfhW+TU^CQKYuWN_> z*J<%~K3=S?F%E6Yi&~TF>0T(4Pu_)3EZ<(7sm(L?1zi1}=DwxcDyEO{O{JPYn-~^5 zO0{XT@2;m|Os$vU!<o#q~>F>BLGh1{xUi02jg#!2t28&b6oSyaC(9WhXJ*-Q!E?Nmdec5CN_*| zTjoT!!HuJnIvgc110oM19pWgoMS{K%{t&jGCDICQ(~##Vu&HoE>Kcf(4!CY;t|r~( zdx6k>ejEW8OOp?lemc_GI znr>AX`CjbxF-^-p<)g}vWR1(Rnd=tZUiagb@N4ZGdvAG(R4#TDj*9NO%W`2j1N9)%3wRkM0}bl zn6ll&V$*8V6?D!%nPl&U-U^|<&$p+W9)weYJ>8UM^7VK@A}4Dkatgu@aRjagwGabv z!}BT5v;JtRW}$_GWtrfXA@-_>UuW{6a;RhTpvcKKZB*fFM{#C@>3^k#buu})9lFnL za3U&_$Z;-_VYr(*=uH?ITP3m>f6b*E`5_1qN?%m1T9E`(50^%8u9fl<+#CKBH^`CC}X z2;aEJloLHZ>}I{n`t3wU@sS}yKR!#s;EQg*7tU@;gXabQ2wN5|;}o%d+s(5^=yr7My|`E-6TA}G!^Q}R+uBh6K-UOP4Jxy zH?A*$-FX$P8br!X67hj(gw5)O)pkJax4^E};ZA(WjVOmuA+`rg#KkNT4~S4*41a5K zl`KA8u>KH9OybgiOwXz9ff8vC6r2BJ@|OZD4omRC9rge;mR=7>?h}cmNfHTw{O@AT2qWOyQetgVm=Oc%S?-dbIl!!-UW+hGq591in96U^Je32h|SlU zcWaD&Dnf5CceuDlwlUK2Ej;=F;r6YVzQugZwWzii9_oPD@E#-eo$z{yrC-7W8(+ap z5HiF#Ouu6A;!HCb{-s28n7-#2BXGA8nS^IAI=^6~4q^?29qMB6FqjKwK%_#PJ_OHj zKwAVvB*fw#SpMhi4l)T3aZJJk9VZ}SU_$mk;2{k|h`i6?$gh=1Y&R_6-;C&>;|VzG z;2SqwoZ0zrv30q5N@lJjqCD38sZ#VIjJ_kVCoUzP&4@^Q*t}mVs)9K}ASOR#q#EJ`#I6L1 z!2M>#qZ!@?;^aO?jzH*;uYpK^9rg)mf|!PO2gD%6emIMzLR&PL0*=kSI1{`Vc0R<( zov_L8Gm>+kMEtB1zx*^sf?gP($15K(4|#gSi|hU-k!+Zu@qIWGz(23Qr-^S%HBYnS z?~84Zns2t;z5~VwM9@7F@q$hAgy>lb!vMne9wXb};8ej`ul5}{x{ZvSNs>r4KJS-Hs6`tQI zW2C|#9!A<|xjzW55f2t2g~p8t6ip8G0=qw$K^@v8Y3_4cP>$3uJI%Z#+Zd^wOu z!93;Qo)TC-Jj66m47(H_Dry58k{NM@2aWt7svZ)@D$K*owE-qfCo!@SVsDY?sx<4A zoqmuXuQFHCXL;6N%+H#ih2i@G^squc{1zIzZiZzgSak-bDk$?}xq`5689u zx)Mw%ggh1Ur4aFuFNW}n7oj!g967Y`K1Rx(XJiXRHY{ls=o<%X1}5DL2kE0^U7GY;7=a(bC+YVe6Byqab=0b1Zzw zghn-1*gr6PQnd#bIR2>6g?W>-tm-k?`w+VpF>)RrW*ZEFHB4uu8IDjtv`;}KKwI?9 za5y2xZ)7BSHJsLN6bD<)XPAF2%v=s}egyi>LO7Q}z7^iq4pABe2Vk{0e89Y&dPl;m zdTxLjVS+p4jzH*a5WRt7(;@R&HYWnkSunjem679*h@1}dFqO`nX5x`f^HcEaMv6H4 zi8)L1Z+jSa0fZGIz=siUh~Z@D)KT!}u)MN|#Hvq?K6&mMv9`Z9XUYwuei6>^;L!(QlzPFc!A{r(vFB0<&mQx&Y$2?s ze-vEaA*SF`zV=@DpatEvcmlS01zd$Z7}*Xzw)Q@ecFb&}vX>)jkjM9$6Cv{$ z9I=P$SIX23mw5dOxIa{!FzeN34>B5*Ww9D7a2tGeurA{Y$9Wqsxk9gFyKExRL*K8$ z4dwi#x9%rj;fNg)>}q|hS2qXTniLkJ2ea)qo*JWP zvkV*Wf~hPCZs3j4;cIS~ewQBaWwOEZF1Zxm%FV%S#qejyosii_c=cU+uy@Hw??lU@ zMVZGZ+$y9!Urz+&7F)ft+#LNQ;>2D0A0*l-qVLi7Fxky<5N;yASMQZsl3jEq=mBtZ zHT-0rsKc$a)g!zqS@+}x$@*gW>cl7#SS&VrJ*-DEA1rbn)|avI;n~^SAJ%K&Mql2Z z0k>RHd{^qOX13uu z5xPc?mRQy^V#5>q37H*TFXFRwO)>wCk)7+{4784?t=Ik7nFFGDz5cF5YeneOI{f`# z#eJXGPqF(B309y_QPwh6U8sAq#xeNuN6%mrWALMozJ#U0jT42sLuSxhcDb+FRAo7QTP{qy@@^}>~HFe=d!#|;s36FUS$nI@BoT_n@NFF(5XbYGhTm> z(gdL&)FUmj$>BfD<381WA#)$*IiKn!iz9}~8=rpY1;3D*97`ep>xc68d#ou|>qJ-x zsf5Q*hv@cOt(zyx33+QdKj*ua~?1Fw+iv5lXX-a=pVr-0W zozaU~yZtY2mSDEq{+9?#H)}c}bZWU$qnhwHS-dqCJjNsETD(1d#`Nvi!QYbaBz-ec z5f)~x&6FJOV+AM`kC&fSa`f4Go13Lo-`=KM$#tq@SDPrFXBnij z>^KI`O!VmwJU)VXgbI%9fcs@LV?lL_yGkHDy0S>o}w?OPVEROnFI^o-oQC{V5*(pno zu0g(Rfn^Pww22)HEH_X#O8LJ6EXUXoZ>OZnS@mNN`|?FNfCJ440)++cZ=sjl9X=-y&Qu;nle zuO{vuV%h3>@gP~S?y=)%U4N!zdA7+)uah1BUt{MVS7%v1{^uOdwlmIQp3O7ENeUYp zDchJbqlu0w5-Oj>A|s<>ii(BC92MKh=eEf36BSGJknyR5!**n6aMp(!6%|toDpD*8 zN=htB#-U;b?%6>3zOUzg$kq4z$CuaZ+WWq)``2~y$TP z)bg+W3R*(q(Rf1AfR)p9meTUWwfCGcE6&_CV3odD_rR^Q#_H?tUo}5o_j*-gYP025 
zX>Yx&HX;4N4;hyZSlLvP)#Q#R+%S65e`G^EA@P>@pAHP2G>(i~JqLew%1Br>KeL~3 z+Q(`Dsf7Do58UAYi&kLR{7_i1&2Q7D>X{6p# zn^^1FM(g@0qORv|UC9NHCdU)JH^u+7Z$sQmWvhLjh1LJOb$02U-h~xe?F(MMsj!q2 zjC(yd##gTxr;_KAljCl3B&_Dmih*m7KL4V$e7Io$1IA^yTs&%(wz*FQJijeSj6dgJ zUqDm$jGES18f}fe$orjJr`K)xSZ#9L@Rrx!6nCx0>$O2EkU3xtyNuMEr(OG>sbv6; ziGu@Hz_st*Rr3vd2CTttl6%w|-X?j;m6lN_`CV5V@q}G`_4Hm!zG&1Mbd8=&*|aYC zhWNzZt2-m~HuNZINM|yv2P_0SWRgMt@X3gnp>wgY-qMp zvj(i%V>eH`>zXZ3tl3Hr4_KaU33UV3TzJ*I+CN~;#F~ha{ur=~hku95&6d$LU^N_D z7x%hHnJYKMS0@Zuvua&D(cNUFhHr?QJ|@o8fdOku{hSO5nHC@hh)w(pr`mt;>id#1n5AwJO8wRu5Pu zSjVw?V~L+0uts9;c;fY=C$Bp~+tQ@U4RJqHbmo9A-b;T>)VUv`31KGYk?)mD^#`r= z)Hl~m!->(8IY$m!W@^@Q=2i2Pb+VuatT8oemE&VAZ5mbywC9@LO;$#hRC~}$re7M; z4&1$o3jDFrlUI{tC~Nd&_NIfDw@>=$Y^Cm_-o4U4->*DqWrS%8ZYNxSwp{l~dDtKK z6|FLvXwp$T?vsqS#Q%6eqNN?uF5QI_v@el3pHGU@w#mKpSK6C-CLbI)xxtC+{ePtI zfAE?ImK!Rik6I-R@t`u#R?^W$qbIMF3irNQ;DqDExb1|@|L@`8$Lz5mx7$U39cZ>P z!=ooRFgyCgFH!8bm{@nkeF-?TY2$$e=2b6qg39la8KYhQx8X9WcH+T~+#?rNkuNfYr?m*l>g;X+)N@BVPruVLz@7>St zzKNNz&BxRlJ((%J$sp?0z#==L%Nb*3^&Go(IweK+k^!r*?bhjwE*h}%j+vZBCn%^+ z77EMHUgI(i*_+S{S=o7Qw@mwKxrBR4FnWRsB+m*>hT z+n6q?l-%RE`7DRikd)NcYGuq`WU{)8wrOv2kI~hPCEKv=Wq#Xo*Rd6E&|Mc1$2NJc zCPp^vi`>(R?p5>c`xB;p2|qsR-6Vq|so#tXJ8_B2aTiIvq+d1Pdcc$detgm|)8{`j zPC6Oa*cqi|=T5!U?SXAFf`Kf0C>F^6@Qvo&B}bWfkCO6Ht1A1Fcizv|e!_@9vcibh zd;aS}e|#l3fzmYBHU0}-+uiZ!5}9o#9VnwS?mAT4Y;BB5%Ws}G>RhXhR3_Kj0V~2` z)Wa!P4M(akXLF9Szt^8Klt8L<AXIGD&y!=saersDTPhfdKSWw$Ga&Wc=-o9UslRgTQd|7S#?q()L zcib~zg$3~mSw0jko(jKcd<3}GM42^VkDakM{+!oBB%H1;s&czEKgL&$+*|s=SOnfOPbZI zb*VFUUMclWcV}#OrSIsgN7tEj7KPW!%}hE-S}zeXrrmC)9C7=kA7pWoX@8o%^wwJ* zCRM^}x!aRM&PH17_O@F7RI!tCuf+LJ1^j34l`T7nvwG1=?8IJUV`AL6S%~&fJwbZnW zR!ct(SjpR@ujn6AjdMbbl0t6TqgJYGd573FooMcnV?nZWEFcXOFkoeFqg8wJrd`#n zTh>P^%dX$$ia!gJ+r&xc1PiZa4}0Z)gDTxrnc@<6>2BG`1Gv!U&RaF#Oc_IMSl0UT zWWyGhM3bHN)>(5Y;LCI84Or=IveEqfq?3f8*w zSd1Bta`*@rJv4c{-~c}8}Q z@G)uos1>%`epjvZSz9f3dpA*Ru@&@FAJf}qKVx0a)-HQNz%QH8sAZ0)$uctHzFn*4 zW1RAX+t#g??rl%=Y-a^*lXH=A<`Kfjs*c+aP7KZgd3e(2zI5uU`QiVo&m`wx*y)!? 
zn;!Z1=ceqD8=p1lySVY(#tmm1 zNAJTt2au`qHqN-3QA<6{L$>k5s(EJ1%D9(FS$eRa)+TNowPJ0n69+n2qv)&iHqeeeA?n^W6)^_c`J1YxlDSKQ9OuJX63_ zyK#5^hPWqD9=>mhukgvkHqdA!~s1 zZLjR{BZw@}H}`My?O9*VWIX?|YRNAb(KmOxp*@@Z?{n;9X+6%S%K+adSCnABTu@x* zbywu$&$jR4XZZ;h)NQ?|Jm+y+U3x;v!B5q=7vDkL{5qw7hk=uBH`h z^~_$Lx&G>OzxZ5KEj}|`Y4_YH6SbjweSVs_zxLa>pLZKwz55xiD2j&k;b!G7Ch%?k zxWTSBxcg3GN_VZ~y*18%(i49!0q0B#D(-uX=>Tcf(B7ZeV@_{ST&aD3_s9E;Tc@98 zHaAhQ-F9|^rro^pI8lEgV}8UF_pc1fV6jwNPb@d*H|Y7({u-Lds!#O#tGT9ax+(7C z%}Q&UYdZ~TWBkjGU9g(_-KH;9Q~f7pr}mzZojTpW+Tf?1TU!t;aCNLGxVYes)7jHThWjPY*IZY)hF8X_f3UK{ z*pyL_R&c$QeCoSf_K(*7IA`s~&R~WJD^za>r?7+FfA$ue!--*_Z#u>E~XrjQsudjlV)CnrPMI$*`d|IP>EyrjCwy<%@vO_Z@=3lw8DCoJwG89=6TIU* z%@-D4GmI65-f!3D-*Ni2DGx6$<+r=tv-k4rwd2H%9RGBo@5^5Id+#~E<~HBOZuj$d zAK&s`-y<$}>ha^>De>{!6PH$me)c}!?e6m`LK|=Q?RU>t9RK?5zBO)lO+~1)l$W+2 zt_ZFBfbW~`?26;x`+$#+kaD|@ulu0y36J|*cOLKfi0}GUiLaEu9{1*LKKXL_@z?J0 zee^stt9pK^_Fa7iM&CF7eBNy1#O%laUC;iH_^JPhZ~TvVW6!_m_x^_-_;=l$i2g?b zng5<(BI7^wl1hFRpyg3@BC7S+x2iPw&s|@FWY^}_cxDsM@S4^ zJ41zaz6#U-z`Pkd>{2t;FcwpuJ+RB87HeQff=d3xyy@cQeE^ER8FpoaRIZ60aj8n- z5x2@x^X3unG1UzV&n}r$4`Zji^v?*xs@-NxMTK4IYNCGLJn9aaN_`H)Gb>eCNQOb1 z9cs?eEhrAm+IpKRc|;1pkV;iF)&2E(vxfvzC+5vwnEf2~&Q+=xioNz{k};-g zpg2}#>tW^px!BuO9u#|ytv{t|Z5~o#n;mKbK0{9>56zo3CIvPVh;_MD<_q&?HG1l= ziGcMlNr4?|$Yw<4j-y8sRI!jDEBXWMOi**eqaIcHGCbl@gD{W$jekPN(FQ0EDO*3J zk|)L9p~|6o6i*iaECsk#^Izu8AQ{W1^cZKwu|Gq>LJF#YQc#)b(G_aev0HG;S~9&e zI$c1>5H*~FiM4JWu675~ivi0$n?|sG5H|T?(bbu{mK>jlDK+ zHd8^)d9RT^pk|;rG6@^7R|H-&8)3^jZEu3o#e=Zx@{kHAYkLcnxsZIJl;c*B3r!sn z7DFte`k@ru3%hbdswzY46|l<_Qr&OYdKAiBslP(&buiYoMpbUndO3`BWvTwHTJMFi zSf|R~ru9rH4a_p_Kt$ERvC~UtV24iNg_6K;GphPv*Oeg^eYcKphq118D&;*|55lfw z9KK!a6|jpQ$S;+0!m1QX!-{SFkg9fcvr-IQewBZRPLK;_NQOVC^+8yKW8HhT9)*SI z(a&gm^H;RJ5sHJcR$bu;6bBZ+jvjTXvan890>$x$r|fuBr97=8g3z91P&Sf&*mYS* z^&ipky-@7qQLT@_SXYWtom#JjvBPO<@mpG-gEF~OyS1JSBd;x)li$($IE)=O*Q@;R zYC|rJ#hzACM{kE+I}O$RLml4;V_g@ku^(xD7|P^q>?0jrpE{`R$uQRCRXIP?dKQ#@ zq3q{cFM-mKlp%EaI}AYrGFuD9fSw1tR)$pNFQg#9D*2VR7s1%!6)Jaxc+wY-N_@8R zzO3U7DEq>YqxakTDwXnzjt|1v;kT-(Nv%&nJUgV)|EvuuPENOt~y7r5k0zA6`j=a`OqHgxNwUK{uLfcRn1Tu+6bxO?aDYsMCUd&^A8;{ z38m$!uW3CQ%6eUN^f}S#`t-AUj8hhLx*+TdgjDwHTF->iBfXB^EjkV4*9&Y1_&e-m z$POvL|FmuJKq(-ar1f?v1&pnMw0s1L=R=O3a-PYbE{DDrcRJ&w{bo zH`R=zPm0bCI%J+N5fL>3rQk7JkE+6;*kh^~ihWscvF%Y+vYz;^l`49Hj&FzJfO?D8 zYhmoLPt_%By#~r89C!2)(OHJ3ks^T&%DYh~G@u*~wXpG@OJ>Ry+8%@r=)uie_d^+i zL7Q!APRNcI&eriQuzntg}Xr=ZOG)*S5VAys#cZ8wb?0um9-)dddQjH%RXPn+pf zXcg%KW?}5`Dpm4s3fK@*)_WW~l%C3auQ;$$^}bKryP-_(5l0^q{qXs!r&PzsU{}x# zsiqHVLj&yl$C6o8q4j*&wR5GaxJT<{Q07A2y;`qc~TV)i=saAVf$Z`|X zg+&ozOqB_{E>PZ1;Tn|@6RuPl$Kc^4Rs3yjA3HAetJd#|-KD(W6Fs2%esJ2XJ;fpS zymrV7?Sk_N#13y#mA{0sb``V9a)b7AgW_n;gv4`+gk9@Gs%%o*jajK+jmr8b`bH+# z8>j6ZwH?Neu2DTp5+6`8w`F>8VECPu$#)sd_FU7lTV8UlWmi-L#q&vnEE29&PPbo6vvKcuQmC!!U~ z8PI3z5hjOZXj7g~3L`4jW``=Yc}O+djH+2E6;C<#(ogC1^)_Rw-DXrx*gT|ywT>e; z4=J`=2dpHhYUv_oZ3W_9I$T~Rqaywj(Oz9@W)nhR_D43wdZwCNrxdP>{# zp$u8O&9EA>*`~4%*^ZbhMxg!gtaIDYrfS=Dg(*+#0)kM+zApwD^Q>=cdj^yqs&w>n z(OC_tJ&?ID^?lJdsj+9t$MLL&ARABPZ*=~8$obE4Iwl!5D&uz~U|G~n=nVc>VM<8V z|4nD8gR)cBp2EI`OXa-iZdLS}(5o^}3lph_c!o0bjOZ)Xl#o?0d{&n;2;~W?Ht~#J z|J4L!ea{HVknBHW?*~=upf{w_FA)10_HJQ7Wu=gjG0%W9H}Y+=JZ-kAn4|YO@gp`n zRr6a(&l6FmeY9)=Q*7mY2;A)$8azHV>(>R2?5k6Sk>Bn>-iTY*#58 z#s0LKu-T?6HtF<@P#mte^`}*Sy0%wBy5@Y9f2lT9*$k@&M{ltiQGHNW$0C%oqWCgN zA5mpCaZJciq-H|4CRH-CPR6QSrJfx}* zoiU@>Gw#z4mck=DvsCs+iKoXZge=FIkHKf$Do_PGR;aA|b@~!04XChb(&cs_tY(Bf zwKhH=30MWNn2KVK9{9M}xtiIGs9GT{O|F)FSE}MVov&Po!+8&BJs--Lw>o-Mh+}<1 zlVvn00oLW16EO?zJzex9rAnPiDjb^T@!ZxsR0AADZ*<}{BPxn66$X`_ba_z9%YiaS 
zDjud>_L=m0S$=p@NkE>_!k>e5ar+}ekLvwAq^rh-q$_WL%v$3Mkd`+J8M0X+J(k`m zd`i{XjHoFg@l{_GJIh+g`+y0XZk7EdaexF(1jNGzC`+rSBgzsoG_6e%-==zO za{oUldWR~r8BtA+-ue|?UNelv4y*CUCI1=~X@NY@Bp=f8fkV=D^3LaJVX~S!tRp6% zI5OktlaB6>@U_O1OJ*{xgz2yfX2EKh3#IFuq2%wi8CCvg(COJiD9d|Nbb2&qc1Xew zRe$7+nM(nUP+FLORCq}B+2q{sJY!}MKh=$$2aJjDNPN4hIDW=7>fNdaO8UVbZO{C! z@F~>~rNYwhoiXR8mdq+>K?UWNj5#X-Jhg0c(fGdBv*0)ha~!<{j-i)2`k2j#^7oQH zoTv(gjBzcL9%#4qi0XmjK=22|A4*WwLR#J;jCGi5#4&h(h=I3TIYN4%MaU%c|44KS z7IOa|KcQ!F#yO0LX z!XqXHW&B(h5F8R7Qsp+EQq4Bm!)bu$@C0Me)f8s zylAx9rh+dz>mJ&34N3#%AO(2U;)q0qRqC&$Kz1e}dwbz;^wu1Ml5t3QG*NlSM31Qy zCQ!4X!@S7gxeMvjA_{V z%k>GWCa!xlbq2fikSPJ)NLKw-GI&+W-(mD3H7x90&8t$~Wi?P7sD;gcX8k&PofF@0 z>tQuvvqM=>TA1=r@^LD*2+7|pWbeG}LYZ_eme@O# zI;-QuLeft_sVIFx>y=Q3WI}X$B7afGw?oO_`nryfLh3P9k7JmCQc&_6I%A~}$Hs*W zNzanDXBubiid%(LU_r^3>N;yXS`DS5L7Nek%%A$R_X8+)PlDEikoW&g20IW{&9HEi zH=VZLp;A3MVK$T=s9bT@z5xYS>Vk5hR2+8n;#FejVgaSXMq7`naYr}2I(;FO_y)-N z&nf3b1Xg2U`E~msd-D{OgnZG(?EMpm4wMSgq1Y=Oz0uKQjy`TPrZgjJPISiFzeea) zEdfZ+PCyEb*#YUA;99AGS1(X1$g_1`PTOQhbo3b^LsGI%(nr!x=0dwRACBa){Y6eFnHc7@i+#P*z;>erv;f8 zi~T8;FGL>_(t{0`h@FPn46Ec+ZBNF-`<8kywo7@MsNPJ9{+U!t`Q1XpHDKBfQ#CND-n-SG4WVsFsnFE1sI-OZc zKsKP9Dp(>2kD_%XdlaY zx~0Xtu)kcdT7=B<`s*P*HVma_gExpCRn_pQod1LQI%CDnI-=y3v!;g@rWc7GQ{~V= zS5OLQhl?*SnSFMAROS8`=~!N6@0Ro(D)UxdP9K!?*$k??s-4y*vRsst`~k;fC>u&;nXahSCNH5rqzh_=GKft&c&8Pv4`@l63PEI-(kihnk%#?^B|;sVX63*94{FMbT+t zajl-kl~CgA9lcR>D#+fa<8z@n*d!!huaNX3j@>NZkKstV8ayBg{K}})2?9_mEEb&t zD{XR>a`gO9pEbk3Uoyv_^o#{1ea%B!4?~%3?b5@1dS0j?k9uJtLo;DBYO2hKG2l@S zl!7Wm$5SO_2nHQ}Qpj_B;b$OYHzni>R{dE>{yHI!HadEV-#4p2;2|z+`G_fb4{0LZ07aUle-OkXJCVCnen~H39wOyjit%cG$1!e95iw$OUQ`njL&~qWyn#*Y`qc zS^C!@J6tOqdx_WOPod*T(ILpYoSJq?*Qv&YM^~zI<>-r!Uh@01W+MevL1}UBA0aoV9wA%%l#mx3$$u6*ujQb;#xwqc z&U;03P6A|1o)UW0h>(cNlcKLwkvOD4V+Jw@VnVvQ=&xtZR4Q(QG8e*s6Y>lQrGoaq zi_Y`Ekn}mTkS_O}5_(k1KZI*c6`UtP3m2iZIOR2TmQS^i41P=E52*&>(Tmljkk@TJ zuM5-Fkb430VN=3_-GdWQ9I5aKJ5-af>vENyC|s$=ggBh;gR=gc?Er^?kXNMbYb8Nc z1=k6Axg=yZ2hN8_Q&bZaN4ud6SzZu3Jrfqn^?E&Ix%LWaY4Tfix>6|n%Y={~PrVTG ziG!-RSVwp-k%)CFAY&_X2@iV-XgkR_52s4;{11ueWi*Csju)>aUilnl@st% zP(PG0Zz_f?uSOxOq1o1XQwrtoHf!q<<-Lt~DykCVK)sNPa^3^!V*h)^&bMD|cBOTw_nk@4p1Xx~GLcVrmmFuxD-Hp!B)Z2_I<4&i`p*T2V zGp2@xa+A4B>@hVbq-C`g5XWYOjD6MJkcy3a7R(OUw^S>X@=gD}1R__dJ|P}gSLzlt z!Pt>RW!wk(bZqc`iSJa=kBfe_YX2lW>Q#|XiJquNY9V*M(tVJgsDO44icYzK{gTd5 z6GAG^JOGbuFjWJAjNkBCER+m`5`m`^b&{}CEq)p@S&AQqvdn}OnEV+bFQIHQ_Ck8F z^by@t<4`Ip{k$-u>Vyn+MdJdWxbPb8OABW2ugxWM=AcAy+C8Qd_@5ARZx0FmYOX~W zSoBrVGnD#adA)XpNka&-fmrO&p-l;Obs~sAH(i6t_bwycF#=Hf}#+2VH@jNU30CISxKd1Fz zzv!7N_6u2lOupdr5)oF#Hu>&{kV!T4f-cB2%znT(p(bCIjO$c(Tl>>1TvI zE1rc^7#0#=wFudmdR`aZue?h_CZ$cUinEZRL3{eX#q z0t)Yt2o9YONW0##RpsckuyVJqF#RrVpMo;RITefcB(8$7qZ?Uv zV&}!iJ&^Kxh2+n>S9HJXHa|+BbGs`17-ZHrK&dEJB?)=tx*u|mmpuUG_!Y8aP6^q7 zT0X8F$*T2JhcPT^~Kjne7gz@ zX?bqJ>v~Q=$sfK+n5||%EKF8?yQvTd3hzW`sK$jH_cixweOSl`{VIH)L}aLvJvxC^ zE6i0>4Z?I))CuLv6%+k(Rrg0B*X}<C4OqVq>riOt93c8LRvcb4q-sm zzEikHWnTm7;X#vtWb6|IUD=uo`5s^OwU93i4h#9ztN%LD+ts3wZ@Kud7d@uDP#l`L zL6)mgdbI|py6trI$en1N7P#JgVE}wx?V0xLh=X^*QQ`u1FKogXT8ep+FUM}$w z71*uotAt%%lMjLJ)EO%7(gnpTgq>>Q9;ancGJ5ZQ!wgDGU@}aHQb7t7M`~;yQbSOB zth*9Bb7TU}{ero4pRPC=N<-2=_J%oS5>ZV+3{_RSpzs5_;1(!7(F3IiN9Zcx>B^xDop-NJ7pxKTEcgkT3#_V7NqRm_hvN7wv?~bKNl8Md=R<`Tl@Szv}(0=w3Bm59ta2=X8A4Bf8uQXgd%i zAnQBY0O^T_M|HxiFX)6BP{wo^iX%eNYrd${_t@n7Kwr}7OP~zNtWDm2e_8j?1e8;@ zqzU41-$C)5TWr-=bb*Udy1eKyDS+nyC|wnC^gbx@v$o!$vYWMD2_;=Q6bCCDd$+B3 zsBtI`=RK~|YSYZ8#IYt-0?=x|qrlvey^cIy-5PcBB@{WUTw1PN!`0egH~O4MTdwR2koA zdHjqoh5k?yb}G-0Ajjp9kd`#`L3*U;Sz%C({aANp=1(M^-(rE%we7Hna(ZlgSoH}L 
zo>DU~fice>kn}iglMf=H%i#OB%}zuV%Ggdod&m2!WZ*3q6n*07x&Wb6m^q|dk`CqW zsD!MdtQR0(WXKoFkp9vs?^il~7L;=PehqmTQ#qr0O#7g8*~plX+4(z3$ou`5Aj@Y~ z$fPVC7xJzccJ1`5jF-{L*Yv7(Yzj)g!6{*DODdEh$bxo@bK*Lp$R=OA zh0?T}Fdf{PS-DY2?^-L%Zb#HJggfaxfZ`JxBj1KU-W@%b)gz`0Bv-?sf13bL* zd{uje)~jLc@Mb5tcC$Rt#x#vR9G!KU8=5w+<>Axg)SAy)B1#vAsdI| zKjHU$L}yimuNMZDkuT(sv*}g!?~-y^|8)f9n~5VrcDV7Im&^zS)V*6W@Zs~VLhkLi zNxBYhLPCDw0Lmmwd9TF#)x`U>-haDxxEEGYP9KyW8MMiqvdIg|_hTnURTGdFR*HdH z+AYK$gYwv2R4V!*H7TS8^>^s>bx_g?160%qWdobD?F_*OiDyz*2uY{O0}i^>*A49q zLj>ft+9H(M+fb$xC}>YUo4h_3vVJQ*BzC^_Dx`wK4{Ln}O2Ly*90`0x>s}}yW@Xys zV>uz^w3Oq3tk=Qa7Shn!QhVW%5ULK^ah5knZT!F!RSt=A78MBbIzj;`!99 zN~bHd8CCU;E|j5)I(kGXJ8G4wJ)FWIj-=mD1U(Ra0P>=ss#<4UgyMj)NB2}Fl*!n) zSL}3kjgAl6467`gd=BW?g;IXjCnO!Oa6YB;6`BO9`+4zb8~8lmCf|Y=G8cMnov%^~ znawl%oGym>ZR$9~$Hv{vVw__3Jua8H}8FOw~DhkE4%^PLEYQrPI~Gq30O? zo~N}zcqH&GRq{=(S3+@Q$kC@@gbc>vC9~twZ>b(Aldbd_ojw9PsW|(nc3>FRlU~^1 z(4O_(q^BW)@3Oj9t>PB@Lkc>+{ky&wn~y*ADgVzNJ3j16N?l>{Cn${aL7!12@`8^) z8sIbPk#=MVNg4JTy+}VYg3KX{i2p^Ok!mCw!7UZ8Ou|*a^chj44=MkZ&xjziNU9Mq z?z!xU%RAyTyua}og-E5ysLzOv`tCI>0_wL^ij@4xXOtn;NIlYwM36pY7%6|R-$=dP zZ;Y4vjph&fjlvK64XfO5wBO}7{CE3}K_vGcztMp7Arpv&cb+C&7nWME$M<# zVIawG)@{};l2m2XT8UI6wMaeEh%_TDNJbJv4vYTcGwSfN0cl3UNEGQo`jKH|44FWt zkU7LiX9$r1l8mGx8R`DeH`XPsFtbtekrJc~sYLj%LyTI4FQgd_NE6bEv?DR32kAow zkr8AZnM7t0<5IdG2_h*-I+B58Avs7sQjC-$6_@%=qnbb+(txxe5u_XGLk5vyWDJ== zrjS`=5%FF|i;xr~1Ib2mkvt?HDMm_>3Zx3DL28jYL?QJ^!(}GjOrRBMN1{j%(vJ)x zBgh0Yh0Gy~i1%$&j3gteNCuLH-NDS#g`jH`I1erjlkXd9AF+EHw zKN3VzkaQ#y$wBgwBBTTRO!%4UviSL_#LjBxCt%bPmc2%P>Ph2q(lj_}0@#}g z>lMz!_vMVgWR}gTkBH?3GfbF$$^MyeGAp!M!mFJ^jVt^{q7x1h)=n6%jO8$K{cHU8 zh9?E(VfQ=fO9&^~VXldl%NcN^mhkdVgJVy{(CUQwb5TZQIXw~mgx5HQjuAf32~QFZ zIN?&9SnGs)>6LX(c!==%)NlXHE*Gi{BTB@2$L`7W8y7ekyzBvQaqPi!>`kOkcI>Ix zot|)K5?-!2XW5=-@Ry&K(|8QC{9lUOs3W}m)9Uvd7dlF#lZL!TnD7S2p39DRkrR$# zPjMWrz|prl;U4T4JK-SqOPussgj4yyW^A>eM9V*^6q@EJBiJ`OnI{R`E%UmJYQNr_ z=deliq7&!ctS4X)O4<}hPlxi8>CjgAC&36kzdq>#*IZ~@An7fk#r5pU9_HBcPYz+W zdLs`OBE`tuD)t)6$#=>saTsBd#E>4O7wJR#kwIh#89^oxKht3fZYy44T#MX<$j{{*Kl8=-j)kr-O&1OUFCoqOABEjt>LaLA^q#Kz;EF^d}g(G!HJJN$pA)a@zg&@U9 z9TGtX5erGpp(l}AB!Y|}o_FF5Qi9Z&{L_pKAr_LngH|IINFCCIbR+$U=Nbw_YLGB8 zh)g1ZTpU31kW!=)X+j1O&$SF6Qi>>~73oK&5aT+(@zHetDMCt+CS(YiK^Bpeoj8M( zA+<;|GKK{6$b%@P8HpkzNb)YqK^l-KGLBeC*7cNe{qYBqlirr#S`&KWqNLDTZ<3M1 zG?1-O)?K47bX`i)N6ZCW{vG$mF|LAyBd{MSN^yuk-;6_FIA%w4iPzjQS6ZyzEQu7j*VIr$YT!NHS zX$6u>A>~LO5XngL)MB}y>5{>ki z{KjoxPc%O9@kHao$wcFoR}+mNpG-7%{U(vF=0DI`nZ)HL$tXoC$zO()A|+`RNk+36 z5rs4$)kp+sL28lNO-aV^#v~($aL-$lxG5zWVWbHurjSA;56MB~r|Ac@{$X1C5gLP( zedPE<7biX8PAs1B*~e$mOz7e?E{hGDL)SaJN7%GE^xqCU;o6pSb~Cr(@+O3COG`T6 z-Lg6Knj~9X2yNJ?d7Z;Mg~np&5r-WPpBIKVhyEdqYz|$p3C1?_R}^5+=Fo>7e(CMp zU0XIK`Q3e+Lx01 z^j&-GQi{#o68eke$l0QcPrVFdo~>+j7jFr@=Q1KIwrKAfC9>L4Vxlx|3H^{12@$8F zkcBlNwydSR4XuA!%R_SyZ3(^mZIn6g6w@HaNk@r_GP@=8JxKw1CZKU!g%^iXGm<=R zk5*QGj=%5i-x|6(g9L%Cp$8n2a6@P~BgyMd-YQQP7lej*LNJp#LzaKsMv}MUnAT6- zw|ukWKMLafS^jYwE4+bk|KDi}&qM5Z_$+ne^ z?p((>D8_t8nGvOUs~(oX$e9v-X(L2O}XpopLBG&W@JDq zIulBV%0rprIq)s!Bqh!wi-`Q>&HIeo8l8JC;pLy=({F|g&#{*|`6OM%IpOqk(o~&e zuQ?}Nm&&DK`KSJzh`e(uYdpu^d`>ugPB?N-IP08>yRo;e(f9HGbJ7g`E1bc9{fF}K z7j%oQxi?!m{;xz_-{+2ZZch4dx4XMBv?D9&;?=DOIC+qY(C)0HOIFLSDAE*q6m_d? 
zlp_91oGT!WTwdPqCoEPO@ynE| z40%XdBbSOb)>t(!85tQFxqe)5-IlMhF1e=jpS#9S$w+_C86Io4Ua!;pd7g9bx#!Pw z{@imfW9@He)_yUwHk>YhJtA>_>C6Bkf`-6OdAcw5rpJUBg6;e8L$fC zs+NiH`u^F|`wsY%+aY#7AtK0-N`%bsO7nlo&BCau!n)F*1VdC1@g*_9RxkY_(6uZ4 zNduHZ#3zG(&MW>Y(7ST|NiD|bgWd^xJ?bxj-jwH0@=)IldhT+6VgQQ#b`WCul1^X^ zYc1z#SWweif1<^xe*)tcTm0F4*81$>R5qV7O@?KMG@k`#T24uugNmQ@Cq_*Cl0=9u z*RoVT!$wwGrfb#)m1g>rR%jb>moK5_9m@_M&>WOl!%E9izpw~Hwm&Jx=szs#TrYI|00%@Z#m{4#&ojfy#J>YpveQ^?i0uu5)4A1Y&kF{ ztRhAp<7wv!#GmAVx$TBO(Rq^jlU1Or(il$-Mf@hvDyf5neLDzca z_aW%%nlYZn6Y0MKy$7y`x_hqwHh?T7C0^zF0TM(*2Es+e1tDVzF@P>yOXzb!kAn>F zL_GuaSlO~80NO`82AXa(Sv8pZKtF=|d*U+Law=fP%3;V2ZsN)9M=&?R-6K8v7&2xY zA?03r80bTg{&sO620ak&9)Z_C1-cCS7}VE;J}g<5j(r%qVb551M77^9ZF>)lj5sLM zeoXub=mA>GfpN=&TA>LO(0mrmm69cN{Mw2J$a(^rw}8+JdIRbWpeOnKqJIE-zn6Xz z^itnn;{OGDw^z##f}ZZx@LFiPd@p@U5UC)i;ey*eu~4+FS5IVv9^lnBt3i(k{dW2) z&=WwPgavK}JqL6H>YsvM20B#N(-*yKuO}L)VDpZqS1lTXsY)56bbH zL)OBeo|JL;M&v%q?#r<3i3(#?(+(etI!;+hre#y~8P=6=Nu5>`WcIqK=zeJYWryw4 zZqW%Z1YlzYk>FW`M80tN#=SEr8-CU@{XRD&MD7H=%1fUDdZAbP2S6WqF~GAZi5t!Vy>op4F<^y^phuTl&d?5GZgMPbmPk|omrLP7(+AIBwpjU%em1E~r2tD6& z{^5O@2{UgO#-EH`1h?_hCxf2krB4Mt*-MWFJ>5%x5cG_hmQ%*%EC)Pe!A+lMOyJ7C zmKlk2*=vtkiW0+^?QzSF#ITCu41dysrMWi~X3u*ZSM zpnJPoHt2;AFa8s*mk)Xg=xsPC)`0H40B^{I`Og5ONUROwf=Wn`1Ud{Jk6sJ9_j2>3;^@+k-y>-Fqu|0(9@K z-Csc`ur<8hWq7A3evHJP3wPuLMb;kAQx=$rpj1^Q=Dsc$#<_2;QS)HRzX~^Y?5nL>FuZ-B4h; z@km%j5)7tfyzGx)t_B^ZV^21J23-r6m7xAH=v@$>g!&htn>YP(+rR%R{Q&4?aQ*Fe zbAf&i(pSo(`CrR|wJ)c_vd0v*vKBYcVo=}zBy@P`FM~7{^mH%%SpuZK9wy31!-5P&zZCyFFjp5@QXiec;x||}JsMWg0)@zdoq%YgC$nLH;Fakb z(6hYsZJ?KW>CK>bdg&dY>%0na4s^2qm#2jPfUX6-85{FfHq11wV9=vs5)@rr*5^fJ&pF}@n~cqm{n>P?`>Kmj9B|L7^0X}k?R zkRZb=!MDHCRnSYl;sbL2SC0a{!7F~=uk=jNJG|lxDt=`s1O4>IU!IkI4|)LjPYjlL zKj`tG$D@7%^b*jusQ*9E%e-_IbZ>wC3G`N2V8wrd0)<9o6S5iEhU`ENATJ@U$YG?MF6S5iEf^0>$A&(&2ksZiR zWEZjkVla1$PQ#DvJ2UbJdNx@omVaFmeP*GSLT+T4Vq+5E+crRiL0p8jz95Xk-jB78!?(M1^M-Ct_A+1O!Qbi6S$rGb}9)Q##BayMl1Y|NYOHhpeJTw#| z%aE1GT4W=#1$hM7i9C($Lta9v$PuJAYqY?@K+pI$pdkhsk4!>lAajw0$Wo*kS&eK! 
zHY3}R9msCvxhx$2188s}hmrD=qa_VQ>XFgNIHVDoj?6*kBTJCw$Vz1OlQ{nC(a?x& zMz$i4AbXJg$RQ-jMjt>1B7>1Sq#kKNMpmE@jf_RcBNLEDWD+tNnU2grW+8KsxyU?Z zKC%#5QH(+fvJ6>{G$SjKRmf^&EwUcjfNVrIA)Apc$ktI-kTx_NLAE11ke$dbWH<6O zvIlt%*@x^$4j?Zftw^VbF#c3D3?YY+BS`YpXy3|6EiwQZhzv&Rkb0y68HtQW#t4e> zAB%=KWIQqfX+$O=lacAj3}hBE2bqh^L*^q3k;OpI_%A_2DY6V%jx-}HkyXfQWG%8D z*???BHX)mlEy&iVaQwHS;Rv!F*@5gtb|Jfwr;$C#bI3kqKXL$h328+-pThC4qG1R* zj2uCVccNk(ds0mwjPFj9xqBMrz%WHd6S0)<#)95No6fHWeLkjcn&WCk(|nS;zl z<{|Tu#mI_M6v~k0$Vy}tvKm>7tVcE=8<9=OW@HPp71@S7GRg|lj)o3oC$bCKjXaI) zL7qeQA^VY5q!Xzkhmga_5f8=qTQb_W!AKoak2D}7k~ z8uUm5G7=e$j6udC_m1UyOF1nJ)^83=g`oH>_-kDFCnc+ zCsIWYA%~G8NCMv>kWCGHNOhP6j(~%j- zEI~2;bI_2B%tPiQ3z5ah5@ac|3|WpeBdc>qTc8$Mk8D6T0zKou2@TE27Gx{34S59F zj_g2oBD;{?$kWIkppb{mM;0QBktN7dWErv?X+~Be ztB}>mT4X)40a?+ALKCtX*@A3Ewjqxo+mRi}PGlFd8+jVpgFJ`q8)XIQN5cT}64Huv zB30xNas)}1W8WjS$N*#@G8m~t>OF+bT|9xn$ZBLW zvJKgdJco25hmnB=*pkRNq!F2etjI^999e~ILbf8ikUdB%atIj!Uz3XER)>s5CLptr zdC0O+R**_GG$LD&oygNjE7FNnkpYFHHx5JwBV&_T=U`;Y@jCvpTSuf_rkit%qiLo_lDnU2gsmLSWK^~fe zkdZ*o_>V5hQtjw4^dpiwr;p zB7>1Sq#kKN#v$X82}mO{>3JOg$!JJNW+1bWImldO9x@+Uh%82yAWM;D$ZBLQvc3X^ z24o|$3E7NnLAD~>kVla1$PQ#DvJ2UbJdNx@R$M~CigY4X~!ATNn6AYC|hu~z1bPMVwa!&9ri3|vaNyG`PV0TL-5He;4iNpzpOC((|LL&Ks zQzTL%7%7n|!Ko6d7mSifli)oP8HTOcoqHiYY*6l8BEx5Vz*RDQt`|T;;1kSRLHJIv zUJyQnY7h*Ek8qa5_z!_Em70WMDk05+55kw>g1>>Ub_C%|$2P%y_zqeS{$`_H@b~bY zkzh4^z#v!ypRoyUqoiB#7$v6#Kc=Kd@GDBr3I2nUKEWH5^e>0;AMzt51Hv#u$tA&` zD6tCOqQogk8BqnHsfPr885tIYCLa+T165oBoXm(U7|-CtMBvMe1PGQf5-7Nlkzm1B z7|{uqGor5$!mErJ1UEAhDOkZsw4j-h7{M)!#0qX@Bu?;mjKmB6fsq8keT*0d-(n<5 zu!)gm!M7Pn7d*yDhTs>BWC=PM$q{rhQjseJHzRq1!O(|-;6#Oj;6%lO;6x>Y;6$Z@ z;6!DD;6&wu;6r9X@S#dU@S!R}@S$o!@S$2k@S%D^@Sz65C<)&G3jv&{N$>@UGz+em zNQ+>DL|O&k1g{p{BatJ5jS^`Wd=I=_@J|xy6#R=sx&%kS>ji(3$Z5f`GU*W zyyPY?xxq`W^^&W+WV4rC<|UVS$%UhG#YIuGJgDqXR?1Xy}dAA4qXc z+O1w2w4At_F7H;)3|fM|kJMUSBz(7edeAZ-l(Qb?=%A$tln#&5G-%oV{p{d<9>qLJ zq9^x|aB=H`LCe7RI>>0lpk?IyXzhAWimX9P%#c1T!MVhw(dU?Ut4m#@C5(X5Z!;x> zC2`PFFcck>01*brCmtE(v@{Lrf@G(!o;5)6oKznV(N34;)KCQ6=7QUz4(WpX9Wx7k z!G7LxGyFA;4AN!yVg3Jik+D6_y5V6K=Cs@x{yOrIqwHB@;8xSd=gXe|D$pDlzp~1d z@(BGGQJ))$)lUk4Lv3g%Dslw4mL z!u(6^15m*?)RT5z?nf8UCzRuUbS@1DbNJ|I>H`|;Rh>hT$IwhxQLi2yTxgOXXfVkS z`RLhSqamNxK^mvgr$6F`o}Z?NY^<_A4cb3v;Tvj^RW$t@YL!hmS-o06I6mbK^{ADE z%V4dvljz`H1LSeR@zeElgY|0c;AmUFq1Hnj?YSA*46Yf_QEPhBH2ewS{}@Yurzcjw zH_dQ^H^4zf@_KdpAk`Z|Pqy;1ar6XhxABDWbXI0PxXk+|zc)2dh$>YlP z>hVF=sL^X0vXYv~xcBc~y!DZcf7PqGgVP{w+n`3DVX9ZNA?Ii)YN>S&Shu=)=Xh#j znKrHsr1L}T)%osm2Lcy=bfsRE2YpiN)i}2XDrm6syg-^6H39=7zjsA4jiV9|{Izk{|@cG1Px-$^3^8XQ!A(_n@+GP`)wopjdhvY#z0 zU1iTU72XVQj(APev$$R@{y8SgB~596R*HNjFIvCYWKnm!xHg0q8HOMQxEeT+5!xog zVK!wwGT1BfUjVw+#p^<7>ikZ}h^JSAoTI(tw12c(qt{YnsF_5^Lfb+v!f}|hUcKRJ zjL0P2;TnAk=~-e?$GLduL|Qbz81p#p8GQ9>1LPrl<&g_{h(0~&s8`>`JY>w{n3JEF zNFToQ(J^HB!?ED~J`fS857t}MS|<-{5D$8;5?SiwM{H%`7V4+UVCehhJ zHyx~&>Sslh^RvQvvPHe-P*Nw+Yc%eLoyZRdli;$2iKfmwU@Iw4293Sh8ER3T_JyV{ zU8Ect(7I(@_}b;;li8aVb-=EaPNvnAh1xm0i)Jv@#Xo=9UkPS za5|u^Q@6ti>a;2|B4AWQ#0ED{j-&>V^4+{Rl14<w6bN&e{%l%ScB=O zf)Z+;Y$oPOKdk8_b!wuUABv=#bh3NkS#sZmp2{iT<~&#Q6^YdsE`(dTc+ymQk!^SJ z=qS3B<+yld6rIizUA!%dUSrJ7TkfGr?AYhreGfgu5}f>aG|gvPCy$&)qgbqqr%$6X z<1RS5!^@saAsMDRwa>w~PoveW*1?0PQzL72D%sO%4-MOHZ-t^8q54Tz+EPDAyO?qF`pB)uh?|DqC1y<>tU;|Q z7B$3Hr<$CNX?s)aV3HfTo@A|qMTM&|P=z5IPrr|D4{Nhs9sl{nw!|kEv_@?p>Jlm9 zY^W4JLBFY>PTgV0%aNUlxv&hYcg$gJ>A-By@_ph0cCb_QMVud|6+wN4GTDFOG> z3oPJuwb?UwyskD4D%WPxm6XM{^Q8~aX!&)u&&{_#KntO3E)4Rk576acD{=FzIOwj| z)#Ywp7DulIUUY3qN-;gNvV~~EUswBGeDj0gFDG35(1Ua~d)KAdAEfz|U3Kz|-_WlE zwmY|&QcQ(~ud7>~d{#UiW=0o}nFDU^bnx6cbP*#i-a3bdvZD^(J%_Gjl@1;=ml{~9 
zgD(EI=EAwJ8&tLIW(WWTL?$_R|j*ED@g>eT6WF`nyG8LAo$RgJLMsW4!==R&WA zIJo~qkmJD*_}Yiy&Jhk?m_VZf5^Q2FGebi~JNTXiy3Wr4f2s|C-&0`VLG$SE+1+E5 zx8~7*$a1Z^e2|wV(n}!O-8^q0Js4f9*14ZuyL_Q}(t#@zQZ0$3cfytG?<&D}8%U?l zOg|6Jp|z^nt;|lMJ1EmN^AnFyJ0os>&_qK649?Ms^mWzWt(-H_)e^g7jPh_Qoyo?n zw98?n_lE(4{&ls%p=?g4I~WVGasS8ZKyb75^YN2)%K{rrpA^q|K3ZR^Hd^`3$7v2* zYvnl^wAeo$7PeTh$65KA4Eit|JBE*6Oh1x$sYeEt(~D^p197dJ=VZ|akPL1ylo%S+j%COJuTU{ADuImpHK+!V1m?otcwVolzq&i?u8Y0$~L)LeVG-q({N+b*^P z@nDOD9O8Q(Pl>MpCEY#~{vTyaPuXG;kx3ps416Tnv`bBZY_|XjTta%*4v4~7b>WaM z*1jM`_GCBRbA=CFAqo;{Pl5DeE+zM_=$!+F9Jb!x0L)4e)d;fBf&aU$u_+Z>Ov2#r zFXt9kPjwdp!FR$iyaYcGPpTsl8!vvlFm48Euh6 zaQ)QYNhzWgj#@vtA_{b8@12WAbL_E^@Ll6!Z%3jh^i1%StJ%soJx`n24m&p#)6A@B ztJoNIBwqjK%b2nK|BN_G-qjr?rX+cGw4R;WrRqHKU&dVj*Tl3$Q?$M}2U^7@!+JeU zZ`SNmeXK!X7uG0j8)D@ri)mHBb|}7Bb$6-FHokNXy|GfZiZ;!*?ot=o7ntOk(Ro!S zc@F)m2Bts}lL|43_5*1LQVa8*NJ>r86T<_QB+&Cz7_A zbcHo)pC>*B;v?<+=vunJqDD0j`smGu7g9QxypX<7CsLISrkKp8Zjjawnj_9G(dh4( zB<#h5vR*V+jk?D@Q(v~q2zRV^hwHaNU2D`DH3Pm(PTro`6HEJPLP84&u zeMZ`SkV6RMP!Bm=f=35ouZY7eqHN}fAfLBAS&Y_ky8Gj`;d=VHLH}_-))~H%2sOZ-kRUI^d{ibX4T64G@ zK~HWj*t`x_Oa6}%8WTR~Pfmxz53oRJhwwKzl#GKj)?xM5t@x2!x1!2<#0#_o`f9%0 zM{j_8&UJ5rb)iO`?cQq21I~i}*yd@ft?m?)4-6a8+M|t><5~=s46u}&y{7i|Joy>a`C(1f7yy0g21jK^0wPtJ>FU6Il0a zruKrFo}P$u`RGRs;5JazFlbxRJo6!aJEYe^rD;TMQv0g5M;d$0;oVD`;AR0XvuS}z z#9wopBN|h-ntGPhsLWLZvzn|QtG|9d8RTZ?H2oHn0kq{VsDGn#0hC-6qTjvEWHuxe zcP?3MqFuw!C*(t?I#W!;&z2R}s27|gYlg$WnpBWczNRdHdx~hYbIzH1;e{9BNtEM7b)WO3rqodVucjmebhL;JPvn?yH?6s| zP&^-jgFA~;pj$u^rL9r-d7j5=)E!X&_~&cX1}Cq7kw#RUw_d;AtD^cUe=?l2dnr+kwZ!H9>eyru%W2_r$;yMQk4!_4RUSaTvBP zF+9uQj+xHmYs#KAB|oT}R_GZE(BblANHW)RS+42knl@;i>c#PE!i5#~8>LidigCt2 z7r!m>=L!FQ6rpl*3`fQ=nq9ckb0cRqxa^@__bO{IRFI{x0nS*Xop(| zIQhX+x}FK2eTfzX?Q<+NjfCs+WY}|vcC3e8!%K8lqt60iuLpN*D4hdF}kh0_H%_`wZSR}ttY(X7`Td1*<r(@mb zbxo6N)N+UDhBH4G`g#uPN)U@Np8vS63no|S&fN6SBtzs7sETh3Q7TJ##e;u`fz7F>MKxg=O$UtOba z#}s)sE^neH!(7{1$SoH(HKWcq%l16zS&&LN*9;pk+C*mD9d87>d`u6<$arIroX{;{d$NEM!S{Ay+X4C8?E9& zyhc3+GvrRy?&N!3ft{xHC4TA^`b5RI{sfZRMw6slt6}zTU*$*(1v%+oW zTW3PoM?)5u2EPM62V-NcA3>2aJv)|tG2sTdf zYj?tt!eu+6GG$6cw%hU`>DmZH0W>taH}N9tUWJqV0qS z_t9Cg#I_UMnRmTP!{T?UC4)QC+M$sD&Dp6I4Suw?tUwGNvG)_Mv(;LWwsCEoUd<5J zaZ-X7y)8zGv2B3t%HD`cjPFoob6a z25zTjw6FVNT}Wz38u$&&L8Ry310(QUB#L#w#&=iH8-|^*uo+LAP3@~8TQv@^j|B?f z@^|w#Gc_fN252vkE*tdO;8*A1DDUYWQB2WcU%N6>7sG5)2j`Ll-0SuWj}ebg_PL}p=)=L~ zFT2Eq0i(pZQw_3RyB_S?2}`68+-s{V6ym>zgIevYeyI8oO_Q!%4z=NwQExQ0KudC#Ysjw|pn#HaAPDhvGoIc1)4 z4L`Mw=Ej#mT9HkvQ#^4cJ@S38wD--Vt?>5XD=2E*;KHzm1;)gsra&X{$?N$H?0!Dr zhl6~1CC!YB!K;0o;%RLNwpF9oU3HH>NH2pc?=VjGiTL?SHpnkk(gHT*Qqq4*$4c~T zWoZ>%CDC4f;`j8Av_~nZhSNgUV&}_i;4lsrgAFybkQLbZKn>M}EVa}4t)|L|e=Nx~ zO;3KqI4!sup7;5Ra=sF+TpqRR8JZ0w{-24?Bca1pt zk+0}uv|eGS=nxG#?~wHLp+6W~O!ozg&3iT z=g%vS)wdYUCK@~su4;e-x6?F2{+&4Bo^GdmC&AC=!~~7rSNxh%IlTK3Ki$|;s$8(s zZGN;+Svf>cQF%L@>GRlYG+0ySnkh?s!fAS^vK)MivFRFR;=f5q#!t6$sdSv=cp{ctXGF#0sVpkX0Og2ditbeT{hvTtsl!^Zilkej}=lD z@8Dhjtej;No;ZfJu_iko8pG;Xwq25(u3iEoTey6cd~9@=*|_-u!+HyYAkf5*_3G7ZJZfAH5o3q@#S|hJ`mI|> zz}^O7{{#HSG$>M{gCCmCKA^LdmG`o2%I4eou^6@x^qVm(Ld#%rh+Nf_z_$PC&Kx`hgdAT>`(?CV%0RL81~|4HQAx0w`(9) zzZG7q@y+vCC_7{0d*-ohHr>Im&SR-ecJTT0;m2n$+4<)AY!*9Z=ZEIA-9ek};`+?c zzfT;~5DwRGu=Ax4vmO>;=P^c>z}z;a$jG)+*$ijSd}smd^KJi1OLFx7{O^e@oL%@* zxt7S9*|^j2_-!_r8EmiAlP=!!2%F6gxcI;$%)s`zI5R;ZN?m-BiS-5hxN4>>gO`dv z`r(Y%hykv+2`^6`1#cQ~Dy5Gy7rk>L{H9g!ty`S~w{BhR(vr@n;q$D|l?^HEOUZwN zKPeo(b!&5wzw+o~Y=_Lo`YV55%!+({HU8vX2=LU#f|Z;lOv`)%4{3?}FG|5O_@!HR z-pb7@*xO@o`jMMI`4KCG6F>18d2BNMiKphVm+uElFa!oc`=OMoow6w@sBpac)sNaw zj*s|}zz=0TF_4&v{}QEM4u|-8Y%Pnj@w{i4E-2LI?NyuAuVFQ~i-khlEwXX&t12aS 
zC6r0ttd_fZLOxqee^6@j*&ivr#%C3Ohl&(T(JC^U{my5t%t}4@? zXLA{gbnt>TFeqAV%B3|-$5^9Pxw(#gDpMOjUdrY|?m2_pY=-ir4|i3AA-&0ZyNt(eO1GH>Q@I>A#mc}I_6Vcj@Tf|b9`dT%I5(k1wsD}oXv#+yj48%A$KV1()QSVl6)-#KSe!=VZFe58= z3a7~C)AxeUU$pb}dszXS?NrY1Ws@2GlwbP;+s+zb8194NRc2Q@_OS>yx4@=7L*uu= zd2mdRPQS-g4$hSr_5^ul?)W2yQ8@$z2W%?+2ntl@;NGiGeSt4bbR;B5W>`h9K z@wxA?a;S}IP&xJvdzXR;FOBlwW0}I}=EvS+^FfMq^O5&pM0dD&Tno6tT9)8==)0taN?PPa+T;ZNViCPN-QPR|(NG4I1X1g%v3K5J)zP9>@pu4nt~ zyyPG=WS+N64Ssv%ge}HQonPJD!erB8)88k)qMor!d*ufC;Wv#&-2#(;p6yZ7am*Nw|`1%J|75|^Yc5m%kniR%v8`Ne~5Q*fz0>@BmQQ)3MMY~qg-=jvZk zi{OqQuo$Mb^L-yc#~rZp0bodweQIF}$uy-*VqMXC+r(vX4@fh+jm-_Q+veSl`|@_& zfKA!n#?~|Tu2s2lh?%Lp37%^C<|8oUY!Y+8o+IoOND+fd;fE|mqVFk(+Sy)8-{Fgn zfdlP!a`Q2!o3O*H;Ubeas;1F~+oT?MD&5D}eG>hna_tkABgtjz)j^*58A}K06l|?O zV+TN5Kd5A!0AB(z&#i3eVh5PKQGDwW`6cV0{afeCj*{Gg! ziG5?HN$eO4lgrfYPCmUGR>GSup4`pygHOBeFHAJ0nKtUm$h5rACT>(uxRlOrwwba` z4sJZf24MYGBL2!Grm6KKPNOpY|JWF++2u$29_Dk;z}&Il&Wp}4Q$V6^fk`8`OJ(rT z&O6Sq7G|~bg1-UltV&=HYoz0dZMHl+SR6@iRBu?7W4#RCOV66mFP>%2$d}baaQJ*M zF}MFk{R?C%d2P~JolnD1B1lTe?Z4WC}I7KGiv5X zFEIE**2ZW58~oT{SJwWUbu)Q`x_OY#y~t`oiW}r-F0veud zz;rF?idKp)vwutUB_-7kUpNOF9CwG037rh@>z4(_>K`h2S%q27>|*c&HQJ_Jb~7I> zI~U5`*VykSWcd-pL@lAoWl1HEcA6q4D%-EKiIOJYk2E~T_uqiR!kWDQM|O$TIh3jq z2K%w4HpP6CQASsMxK)DPS-q8SXVOKvRJ{am-z6y*q$4m5NzzA}Wqu_09lk@BzGilp zvQi^iB$};U(@HBC%~G<*NYRj=otp!sDi&f_{KrZ!Q<}lIj+1t<3_Bk;Udj^Pkw0GQ z7hChhK&hJrTa|zb(gs>oYU@qwNi9`t;FTNgrRupGm*Dxd6wb+8BF;`qGX>@?&g)xS zs;1kzQ@=_RXXj8bTT9fnN#cMpw(y0*QnkcMqi*Jr=%C)uQct9bL&s8B99kkTO&X(T z@JitsNMm$zXOOg<#afm0JETR7opvdWcS_I%sqmZ{BCUrCr@MK`M5%!3+{)&OQWs@$ z@CCsnsRi~9+&xKZWmQhz7%CkSM|ByKr6D#{t(f#uCyhPi5bx_>R1d;ofjBm0KP@qJ zlTtOz3h#XK8j@dBVZqdwK{<-8rE0!inH?sjGIn%_veh8LT6o_2-f*}iWnO|k7YXlK z{35Iu;+gs-I4kO2G9%?h^=rrQbIVM9qj;m>B5-6`y?s%#ieKyNyquA(gNyJ8zsJT?<^VHn^+cYr_%5nX@&%gIJ{|>R?{iUtcRpZ&wy@EfZLlKO2j;=+=uyCdDBAai2MSqz&s~O zdWPx!_{T}o{b0!(WoWyd4ZDaKk|N_Mi8PQ|cmPLVb<*5%-l=~5)yVCSjn zQU|b@dCX+0I)RN#!ij z&Zj>v8TAHxqUnj%vfhwzNq6ze|GVDjHZIkY2neTh`1Z%8I#y#2ql3gx=SB=W{Y$=C|DR1B@3Ff9u_>5_ZbdtH9O3xDM5M|46D3!}3bu4@C zM`gomY5iDvElkJUxk2g>-?AKdSsGzmVSii(9vTgw%Wag3p(E_dsd7o8^rCX+RjGi& zJf+O8knWJ^zm(Li(!EsEttH30`KE2q%l=lSVViV@%4@`uQuJF%>&qG)N@=y!OKGoS zs*zy+A<%NWq-uCP<8y1lSBhagHOwiz`t54-5#*7v1n=piMwRr*GvA1G06QX`dDtL^X<_7|uah;Vi|4Y?Hd#o=p&G02r5JF}s2QEtc z7=x3()bFJ1kfIo0vJyftgzkHNi5Y@(x^nb8DTC^X=grhA^}Id2P^`kM-~?KH%DM_( zb?}HQ(pr}Fy;65YI;des?L5&XMf&$x6*ym9rHT!lOF9}7Z>M`k5676X8oqR(b*t0@ zD^GAsVTNq$z52?BCKYZm8BMF;Q%mu6MS&U(vu?b;K#dyYo7_^|$^tkgWpDcE1E-bt zYxWd9_I%p?`llA&(up)(gU8{laFrTreRZMPl%E`})6^EI$Iu!GIT|5HtLJpH0A}1L zJZ1YDPIuFKzVsZ@ty1CHIB-zX84BQ-cqZi8?Y;$hiaNAIx`pr^OM1#G^$?tZ3`zwR zo52>)D^4h*A#J4_YHE|@$pI}_ZcY8O=F=;;V@e^W))TY$3heG4V3y-oqT&0QRj`vM z(SASYuoBXhd(t&uzI|;r*w%Y&doIJad?tIn=?Z((IF0o8@cUHBPYmOD6{eyNm$F=y zj{CyG$is%EbU2jY#lzB!plVl5V(`Dlrv|3Q!^Z(oPWUWqSPEquTuS$_bkmp3wtmHkNTw=K&C}$s&@79bR^UFpp^b>wF6RL58M?E1w5E7t;a{&@9^{}*8l9sJ$cvSmI zQ-GGQdqNKV-?(o)aU~G<))VqlAGl7Pq~*7skn4OVX-N{uJWEbx!2e{)`@-OkM|E1# z0-*{*pO`3sOCikGK^{-aM^Eqdy4GHrZMtbUF@J2g9P zg*w|s=Z)9zfdiiu60KjM!h5=DgT09+_?MX8q_aqG+r<^|!he?jA&4`$(o)ml8a;fA zxJ;fw!`h+W-i2`b^yz@V!v9X+ymc!8y3Y>2vixm6`)RqJ-Eb(UpO#zTS>4VztdI@k z#@YQN$P^#_czvFDcxzrE$5vc}ug%8m?@7KVMeIsfs9_Gz?i5P70?y4GmunJ9&zdW< zQpi4#`U}1Ob@^)4hdhO-n51;nhfy{g7&s{*loNKS+9iJFr4MXdkWcc{?x8lVhTqz}KrGjK2qd zc^rbMYajS+7la%LqW;t0;>Yvk^7)B)ZL{OfwBXbg@Xgr#%Ma+uR1!trpu0)bUrblO zTLIt0jZYm9#mu$ee|ZJG@EyJ14hJuLMvi0C|DYUvMqbbSWf)8<43hnSR;K5}fMRnS z_>5KZ4tCJVkFAnJ+4?v5*Q?~a=`g=GijYG2d4E|;jtu(}y{zTGFO+99Ny|Skl&4R+ z3y>?SY-$Cr}WG%Jn6(E=X@(Xwrv{ zO`BFEu3D}JTe+!3zL%=X`V!fP%1^6{Zt?OL 
zJiD*wjF5QF2yb2g^p^IMYqn7vMcap9yYMU9`(Qg`Q?9-!@9<-X%zX2!a){5OEimD2 zQTDtlFNJq0Hh!%FZWR5h5@D7Pu%Hv=FvuXxcm-y-O)v;IDHpcN+o``1lyP7zc|}=U zDZ}r#wndJQ zO}2@bb>sE9>etrWgK-&rG8YQNYni%oP`q>%15DfC_hzmNr(4G@vOy<48y(vqt~%&- zm2r??v&a$t&Cf&SARH`GBI@O}G%QyQgdN8AD{yiaW?8W0O2=P)Ng0F;2*aoN!8hbD z=|7bXyJ0yD7=iCSd|)@S3|{ty}*HL3TY zlVVbII&he>n6WzPLr+R#D;#gy=Z8AvGML&ke#q#rHb+gYDaJ zKPp3v!Rf1cIP?~ISXV(G4t*E?_z0z25^cTVreXV z;vcVD3Mb&d#O-m?h-p)#w&<0sSK71GPsj`lf8tN_YH0UZD3{sAyZ$8qSp%DirSRtA zfZXoG5^YNCU*wPIT_3|P1HvBIb+iL#Kxl;EKIThuKjuD%<@YtMzU1UFe)KR*DXskC zVfon!U^)5`I1z;Y_hE(oNXb3|JBzzOzSa&l2s)7cA-oH6J%pw0%AOD9&*@zX?B-#s zk_0?<)R#;LRs)A2^dD6=De^xlt>>8^!E|-h!4G^STbU7_*^bGV*cm(D^s)TjT}L46 z!!Xky@+BJ}9E07`Aqdk!4n4%@cgQc&*OfyZausFnVV-atX1sS7D50Oqbs9MbzVGC1 zU2-rx2Y>R_1x^~`=JqZ*Mw}DszJQGkY&i43kT1zg)DbsNKPkUMf2UkMDcgP7g&F+9 z8Tt2gColS&`~cml?E9M>2`6M$-qj=5%TKBOa4g*`o2k#G`(Ty34-Wg}S}^y(A?DxZ zYzkt8oA;cP7e(K2y=Kb3!U~t>m0lrJ-ZW)gIWo#UFFA0cNO#hHG zd{~iPY4`@F9ai^)GSn}xW>ALZ|CU$z?erxV4RBF4Kl5)nB49oo0Qtfv0#B*cRzC7? zSZm+}EMfp^SqO{5fP9!%@EZg2vvf0G{;m8J;OMvVGCo58VR=rT`GdT2 z@+ArO-oD@vTJSxfA;6bxg@kLz`0~H~Ab;ztlYPmMX%F?Ddel@Ls_FaMruW{*wxg@0tyhnybgL)Hc;vzcbEG+`B_hj6U` zZULdIfM3*TjzLw=!YeHw%^XQ%f*Rc97yLB8p(~XTe+{f&6K*IM$7lkXoCPP!%C)f? zqa;6}Zo8$-ouJvq{1(CBehggcG3DeP8edB1DrbT1Bya5zk^rRS3@;_rDGrQCB+aTA?#1$ zr}UZ)>;fFO-lcihKks`DNriAsRj%Hpd6@bw@Fk}nhk^b$pL@4vI(v3lDY#qn3ac-__&sPg~+Gjs3E{UIU};+1ZAMKnYrR3t=FM7kj$BAJn~Mn-l;GBPtV zawcP4)_fZRI#)m=BSW;7b}+-peYV8ubT%LTJhdx?g}#|onZ?2vpSzH+=7b2RcwX#R!lnq5 zq(?FKL5d)GkmZmFNOcN^rg%JKfMmcO2C+ldL0<{M{us;&#zIz4$Da#uCfK$JC4&0^ zxE-y!4lOlrF4BaA!q$^)ViW8J`-Gg13BP)ti&I*s2(OKUF*FVX83dh}wPF}J9HwMW z6&4C}(5;h|%BjMkDR3@_%!9;0PN8pzU}KDl9fkDA;>gFUc@GNP1mDZ&M0Rfkjmt(x z{LiWULqfQLAMR2kqXaywxph{FnIRM#mwFx=QBKSdg8dKtRbU$+8zDK6^?z0RX9#8d z;Xf){W(tY?vOeY1OkowDcu@(TC2Zo&J<5SuLMgu$f9<1%7@wsc{zj&!_F5)AE!h+; z%o-hm!>~L1N*uW3FY57VA(Zo5ifR1R3+Z`zrYGa7(mz|MVcL~UFkye0}5bfX_sXC@1Hrg@@Q^`r=; zCNbj+fyI5HZp{=H^43x4nn5_=K_=zvWx^5OY*K#qq;O8`{I$S3f32)sE?lr)J}xlq zio!~dtBFquZY}_GJg2a*C(g~)uHG~48T5v|KTFsUxWp6LPZjy($}1+{Zyal$TeAf3 zB-O&Fg=|bXxvieh7Ob3jvtDFT2bIe|5kjInubE3NJL$ertTa_E!7xdW>VJFr-!-9GJjC2Ul7`Dh~J;#d?%Zm@37A;n1)!1&Ly5kyp&!oyfSu3WXC^%)QBt*a{PVO z`XY@)X;+o=E@721-LoDu0@er-;X|0FM=vMl-sRnfJrAW$Pg#N`BPJlW=nY} zbqOY=DrIYgP`I%K61i3gaoBqF2ZCG(&6XS6 z#&&Z`Y0A!2x<^a*V8v&_TEQ-*W4*}FQc6|$N|DV_|r@Rci?fNSzuV}G3 z#6I@FW2>X`F?RDSZz}~a2^r#%j|8^rBlXlv!hWn9z4~#=$KHa`6n%3WWkr=v_oQR7 z>C3_ndpqu*(=ly~&NK4G0lE5mN+CiF^(s*XLcYCJ3z6}^LUi>!iRUBfo};~LV*w2- z_Nz+sD?*kSR)en^*C@g3g^kuDJ@hDJ3I6LS`_>C(#>F%fC;C<4X4qm(+=z;bD#*oT z)Wx3uelEtxZq?kIG`sG;)UT#*5W=~jtZIR!zc1jx3(O7KS|+ljy#kxMS1Ef<*uoE0 zDS;b>bn!%$z*?)+yp2K}7qb{Mj9*Auox0ew>UuiPj6`SnWP07#zU;oPFxTMVKZbvj z`b^4Vte4MD$wsIQgu*J}3$F{=yz#CYTPTd>#{FH?o@$)06qpfzH#hDPm}`%+@(m#= z!UumC7Gt%7HmKn!r+dDliK^!g|BXem3kUy3NB)D<_fyibz)m^)h7e`F&}Tw>;Milj z*d_t*@2{&dn`jg>;%)MqLSmq`XMyt%t`-}MV4d$?$_e(Fv>0>ml@o6YVJK+tjm4hB zeZHk*cKTM1siNc;dp<=+U~OYBIqYO3OVl;`hqBmXysndJ1@v=|-Lu#edM%y)9Yz%i z8G!?Cf%PZ?%X&v-PKYn0d$+o~NU&i(-8H4*EkU-P>|LC)7>8NBro?R)ntUs+`Bjds zT8v5b=~&CH7$spt(CJ+Q+YQ+PNrS}5BCC)@w!d5{lZ2`C?&!EA1ScNrwN+2p8)Bbs zrz$M=G-5)s9XEzQ7CioHTxu+;iG>X*b5H>v?ID+5@F?WByjKa5g#_Crts9a(Jm#DJH!x zvaWBHfu9QN#0zk*`&P}{DpVSH%Nr160)tv%>wIqOJL1kDv`U0w(&v5P%ds17={9T>1xzS z88{D;{2Hb&3G0GZye6>CUODqJ{NQ##roN_#?+Y<}{A=pG_XVE!Pka?O5|BR~yEif) zlC@sR{6NUG4y+g0efS#p(0b+54}|^J2I$F-El|=!i#;ET7Ot*^5M5!Q(gZpY&X#Skbx%Q7aUis z9yoz=iYaM0kZH=jR^eH0tFrRz}7{p2fi0F4S`u$7c=P`{Z}=*g*{Hs_8X6TvM_Pvim+ndrda}OLWIVd z0_!I?J>q4e6premXW?^H_B{*l;^HSEPIW%WOWaQ_YAxW~i-)^>-$GQu!SIF$YW6he5v2h~&m6!3In>qO!P 
[base85-encoded binary delta data omitted]
delta 67779
[base85-encoded binary delta data omitted]
zc)00VyZK=^A4*3?Tss2|;%DKZw>@hQeM%3kRxMne(cDefA-do9kJhK=C(S)U8cuM|FMyOQrBptYmLSv znnFvcf_jcN8f&P8Jcie3N1Z5u+Kfh{`JMfBQPSx;ud#xPXtdsIETgt964PYdx$|(H zpR|9g*BCyv7ttp5!$~&;*)7b7&bAk#RB=f6{C8{+-t_Q{+8N zWsgw7sMlEf3>_Nt8tZ7_b6#T_6;ab;%J4|C@+-szm=l(P!d)#ZxKH)qZFwfl< zf09POZd1O`>Tuww;l~3ToKvHCj*r^`ZeZf+kSB%WLGDDMNYzt)hk-=>Q6%7)qkH zo4tnYo7hE~SD(JkIqeej^C$Su)E$sc+xtG@3(; zXayBe2^sqs0%}4ns2z17KMJC5)Qckf*j+Ia{b&#+(Fhtx6DWgnXd2C;1+;`#Q2`av zI`V|sb*LG&p)M3aA=HDyD1u@rjs}s9QelgQA~AuoXd2CQ@<#y`Q3W;J#5AII)P(}58--B}4WJ=p zqZArL6DWgnXd2C;d9;L9Q2~`u1$o#%O{fL6MOgn%5#TZmIFbqsa-Va#!S00P59?X^NWwUct>7#GvtIRlHR5=_~RC@ z!QrB=CeqS?joL^b>sH!Dy3tLC+!#XYA{%M)O1r(B!1_wyH{JLo>EqP<33=li1vuz3 zq&Kc<*WZR?!A;9Sw7d~dMv3(CZlw*kdW{p@bTjEDH=UvrC%Wl1X5}O|ZB;P%rL5{| zqkx)5o5vG=IgI7<;XlZ7DL8jf%HgN8Az)ADbG zJ;vxp2F0?ZH?DBh$b-2ZK^Q6gyL>ZQ`NT*DLf8av#=&7{@H8tRORUHTwy$0ymHZk`{?1`N4&k86ja zTv3L1rqeWCe4YzZDqa>O@3#fpa?%)(dF*Jxq?yv|TF@;u82{rU{+@TN}L}@gO z@~HWJJS0&FB~cbFp#o~TlSWY#*(i&a(K_n5iz5>ap$u9^o;dLqf5cD{O`a;z^@Gd?>Oli&1ZB`1YPgq~K*K19=FkfAB5pD(Iw$&hkHRmfqTGVXUCxC)=6kL4Ov869TkIH~Ii#0i2@UZwIt_bZ z4lbf*;>NKP#1*j%sM5grCGBOgO>EM$_$G)MMl+E@hCuMKfyg{lt#x;FO`g<1PY=!6-7^=9GX0yxr7Z(G=jQN8rdjvHyjL+J1duU81t~Q442^ShKJ?rc<0IeMV*K=ut;WBh{z3x(Rt}Q^K9!vleZ7k=tdF3dQdkCp``dw0u7-U8b>JpMWRMZ#uB*jk<;{&ZT?Jlk4ZB&Z`pEd7jgFqvktZ-YD!k&-tj!e{lIN;bPQz zS-8yW=svg_b^q}(}QX=kF-*fi?Ssx`2gn?AJ2EVp z6Rt5K#>_$IX(F~PyA3%7yjxZ`Eb}I;(uUQARbM&i?74|L%|1QiA#palMp}%vKIf1` zz#j;^6t_D~5z|xe)5f;%UBEAD?{hAW5D@Hh`dm`rbZ0hVHq>|bIbX(Ub@#pbT&g#Y zYZzPA_RXKvFI2MEY@RI}SG}>dA(;QaF+-bT_#Kx2Z_Lr7b#fFre}A*N)#H}M-dRW5WczSX>OTY~g}n;s^eaMR6yiMi~85Kmkw+@q*}qLk{w z*zyh$fl_r8%6q8%#AWjn-D&)nQJg=h6Umim_ZoY@sB7pVy>Zz)-%Jl}@{iDqjcaUE zI=m?+y~#hhDV@E6_wB}9 z%sjJEPDD{@yYmG0RvsL|Go0sQ=Gj|$_6>A6ue-^?4(FuX%(EMIaVit!kt5(_r}sAV zxUINS=Q+2NyEQz>% diff --git a/TMessagesProj/libs/x86/libtmessages.so b/TMessagesProj/libs/x86/libtmessages.so index e57fed702be47809d1f94f416999dcc5d3c4b5f4..05025a396b5693778a9ea2610bf1d7abdb777fef 100755 GIT binary patch delta 96925 zcmbr{3w#YnA2|G-*|V2(a^jp21QEeWMMMNakPrkxTv|a8ga&Er);2CBB$@iIv<`*}a_`}Xr`zdN(DGqbZZ zv$K0PpXO(W=dT&LzAXva&~D(&0OuwG|5p;=AG073ptxldq1eaqZ;3?^B76lw#{Cr3 zb#5Z)f*t?b4}gRCejWTf=4M>XnDWNL%YUna1fgofDSyL1dFF{^yh44N*e0uUsnEi9 zTAfFeo7?*6kG&3s8X*%G!AaO>5QG)Y&Gw941XU=3unoez5FY*&!X2H>f*XeKLikPC zuVNUo5Cmu2aDzue6htc+-5a78y^>8hge%?5f+y~ehVU%NR>bfU2-m(+t@RMz0LSxo z;MLd<;dBV|_T%C65dH@8Yl`7NoP_oQg9I-#d@XN)k>ki=<$j{MgPC^f7A@3(A7t$`G^*>Ghbt6G2e5J}iLAbJ&&EPz>u_xpO zSJIA(q5*IgK+dK zS7a50%U-!6+aO%^O8Br@NDyM+fR~r(XGrkwE0^dlgqOZ@MR-j&L-=L-HW1zl;TBlo zz7Q^kuqTGcL%0&ckd6JkV<9ZSS-#xA(!~|d;FX9?kRZg(_O#Ipn%LU*)a21^eVcze zZo4LekPYFNmm~9LU{Wt``?3b2<(5EJ`KVugxxWHmMaup>1k`}GKz+H+TM2Y zqm^y)Ho5qTG-R>uwCh$HJuL6-rhW>IyPvno-9w_4+ii#2en~6-&U?Gv3qswrytgeT z$z?Q59QHh$2tRZYgsjndA9eI2jbk5~1T#z|jRi$m*cIyjFz>X_*9wiEnD z8y?t(k_ooA`y3$SZ7utDNf_Hih=n6ugeF3VFhR(H@XMV2A)Nh6I1s{FuY^M&TtEJw z^ur+>4dD>1*%%0yLs-S|R0vnU5{`xNjM`Ts7D2?NSHdX}Hc$Ac02_pbSHe3WT=`1+ zuOJ*Uu8H7@CsYXG?c;4eL2cUpyTKX@6%ea?C6_7)tI#At(*{6ZFHiofVQ}m#Cwmma znGja-WX?f&{VUvcr@^C$y#6)xQt*|I_wSe-9dR-?q2E{p6rlFqZym_+YdkB*V$R45vWY zbwb|l{@>0{fbq?1UyAba&%@xq@e&Fe=(HfM`>^%MfT8FAQB^uELGj6z| zZm2uziH4!!=omBtjYOl+X!PYA9E;&ZG}&Q-um&U2(e-Esnu%^kx1(8THkyOxqWNec zT7(v(C3X{_=`o@VEk`TROK2Tx^mR0?8FfWHQ48vg`l9}5AR5Gl@`VtLSc9gcrDz$d z!s>)yQzIINhNIDF44RCtLDSLoXa<WliL0cao^godD@Xc!ufjzJ^PNHhwKb_fd^jEF@O(PVTDnvSkVGtf+QGrAqkLbK5v zG#Aac3*)a4BZ|;sv;-|h%g}PP0=L5JYtaW>KK|=4q8=6e@tUG4YDCSb zE9!>2qn@Y*^+tVBe>4EJkN-f72tq^9P&5n;N5`NMXe1hiMx!xkEIJ2WfX4ab_+Nq% z@n|BNjIKe`(e-Esnu%^kx1(8THkyOxqWS(f{tGdp2rWiS&{DJvEk`TROXwA}60Jh3 
z(HgWCeUN|~b!a^*bj3b^s;Ch)qpqkM>W+G%7StQ{Mg7r)0Ne;fgU}E(6b(be(J^QQ z8i_`s(P#`Bi_Sq8I7|@YFk%TBk0zqY=o&N~U5{pfU2kwHKVSm8|sN#P;b;1 z^+y9h$N0yHAT$IGMZ?f=bPO7a&Ow);IcOnTf|jCXXn8jr{}mW<3B7_=qC$5^OBhi% z)Pnk>L1-8nfkvZq&?RVccO3ugF=8{Cjpn1pXc>A5twL+jdQ=T?RM-{uM19dfG&BJ> z#-LGXEE*1)W8Ffd!(Ev0A4M!u<7<2&|k8Ve^ z(Ok3;Eq0h7lww3VdI_yYYtUM>4mI|4RNM^>MD&`fkQx*g3zv(X$h7tK$=jY6~tEk;Yw zQnU;$M=Q`v=oPdQtwO8O8nhNocz_#qXgwa82B9HnC>n-_qhrtrG!l(MqtO^N7M+7GunXfa4kMPJ@n|BNjIKe`(e-Esnu%^k zx1(8THkyOxa{2hr$B06-2rWiS&{DJvEk`TROXwA}60Jh3(HistXdnMVAH3G6iW*Tf z>WaFd?x-hfLA_C5)E^B%1JR&9IQ~O0A`}fn!_hHl1R9CPpt0y2bO9QNEP@ykg zN>oLSs2O!d6Wnma9rZ*ls5k11`lA78AR2^*prL3O8jg-ZBOE3Okr)w$Mx!xkEIJ2W zfX1Oq&_pyDU4y2h>(LA}(=Lp^%^0yA%|f%$95ffrM+?y+v=}WxOVKj49IZevaryYa zf)SNy6lC4M)eI5olx(j{hi(h(=@3Sac4$0F6VJpz&xTnvAYN)6w;42AUa!<9{W%uM z{%Aq~ZUmx1Xb2jLhN0o;7&HQnM5EAXGzN`D=b#H5!t*~yEJ5SZL^K&)gQlbF(F`;b z-HdKWv(Rib2hFt$<1Zg03eh687%f3d(K56gtw1lKSI|nd3av(K&{{4Z{|_*t4y{Lp z{y3gd6*Zz})D?9@-BC}}f_kIAs6S{Q{{a{ghz6mdXc!ufjzJ^PNHhwKMq|)ebPl=z zjq8u&e+fp!qlsuTnvSkVGtf+QGrAqkLbK5vG#AZB3;QQHnz#rfiqR6Z6fHx`(F*hu zdIhaStI%q+2CYS-2RO>jO&I9d$VUs&BD5GSK}*pxv>dHKFQHe^O0){CMr+W7THJVm zs)HOAGNNYG6?H@1QBTx@dZWImKN^4rqCpN5gb<7fMZ?f=G#ZUTW6?S20yGX?g2tnX zXfnD6O}7iL5JYtaX2 z9a@hXLmijQjJl%ks09r{!_YCHef&paL^_&@=AuPt1zL&Lp}N723b>=*Xb2jP#-Iz( zjKMhmb1@xQs zzzy|BgU|>x8eM`Wqnpuev=}WztI%3h9quU4)nS6*ixGk77&HovLleETL1R9CPp-a$sG!xy7ZbysIVzdOUM61whR2L2v~1ot;0L~@}8qqxw5(cC*kh~YjXLM*o#73Oe9Q(*!3jRY#h@r|+20^A5H#B;|{ zA(1d_+I~6V!cLo)*xQnQe&0P$aj=O{k zx!i;gsF2S$mcq5<+Ne;(-A09C?g_ZI+>=x&<$gznGVX0ElyjkvRB)k>T;f6>xx$4$ zQptrrQpJTnQq6@vQp1HlQVS+1&`BQf2t@Uvhr`X(fduZ(aJ9%*yM0`HD=YwjC;N@;kn`)!(K`$M2h7FTQV@R1Eb` z%hcG15*beVurm_zhx1?!5@`);9y0lrx@FnVII8q-(^UA^CEOB(N4e^WH|fcO^<+fj zY)7-C#QHztwcZ9|CC>bSR96dR#nb-aM0(R4jWsfot)x5KZzS_cH?6fZ zF^RNGW6jS@%!K+iW+5)5AFo-W3-Qy@e2wKcB?V|}Eu|TmK&W?P_G@#P-#RqbesCjw zMcTfxW_BkhB-fflQ0CFv7Yz+*N8K748mwu)hqYU6$vhJcKdepcO5QPeIIeG0N(Aix z+cYt$(0)RbptT~Cq7G{}`jRryvkWq_rWp@Ib}2kNQ%ctxke!VeV9)eye{#m9S-$<) zi7BD>ynN374S7K2zj~c4C!~gT8B6y3j|O7ZW67`N4*M>GG?3fccjL%FJ*m_T(?}{I z*H}h0>14<`0vA)u82jZBb}E|eB)_t`)5!*Ml{HLPYT-)cwWi5KDFs^z4Z9M^WOAk7IW zW=9jqBXW%8Cz8ITEvro=-O1O?vJyh=SoBKbOTN~uD@l<=3fb`0q#Lk&HR%Oic2B`- z=-e99){vQ`Kx>^!AbgPR`iT5N@-=H3`GJzJwCQUJl}HYYv5~GMjD2Dw!4Rt8p`lF9 zfYSD{J{cs0?9-NI5D!B3vP~OFf8ghhWCr;{3;u*$5y>vraub;bEZsy_66nmEh$|sG znagJK8QH1r-%Q#N^0{_q3)x1Fo4NK<~X+yHe zEJAgK?8{xGqd_gS&kjdw3fc8tq=}{2amjDOVA)jyJtIzq9-dcZkBxi)9V*XQ$lP|5 zu03nuNYjMhU`U!+vB~>$MZ>bp=v=2K8#ThXm88 z!))9hG7P3JwtWxG0}wv8hs@&P4%sB6FHV5%(l%~~^MeTiY8H7Ka!4z2oO_zt-X@VS zvGBH;dfnt*_O~f2&9#tyo=s*aI8M;ttX2Fo5Kh4`V&ql&vDuh7r9xJ z>K)mdp@^M)`{jW(Chz+YR|Efo<|B?qd1zYgQ*R%-FCeeyP`{)6Iyf4M*N3h7f_RyE zL)lx$-pIuoJNgB2^Z(~MeN#^0kbRB_^G@D$rik07aTTWVKf=lIg87w{_L4TdFTA#w zcqRNh&+p;T&m65Cc>_{jhgP_0Z}HEt1ZeSZN_h#k|19CWXKkso z2d6V@J+G`ek7xR7mzr2{%RZ4U5|WGgTxzfAtJiiSugl9qpib=1y`@`h{RvL_N2 zf{<*dqt`^jz__ufoz>=;LnFaYzJ%=E3z_?uWI{(@`@!b43aBRB(0+mG`Lh_9kKx7! 
z+mn8Q7XQKIXY(&)o4+LSG@y{RJm9#wg&!bqwJhW3>RruGtDK)`=5za5?YyM1a|cLU zKK}0?AS3kY#c&oyY3aqShhQggNr&1hS8+&??bnr1I5lV`CVVspPyw&A)As% zDjEmaD>%^>>Zs^vkmI0y(vF4|vf24?v#Tj)pXU=_8eYiC^GOTdZg=vDM{9oi){%*j zRbG_i(0$Pkcg8?JJ4ifQjj@L#?Y%74zU?v`Mu#~qqL58HNW5rdAzO8jZ02WSDIg;Z zb1;Jih3ta@5@w8Zg!e50+3|zK!u~2C4?MBux0J#~yARpL+1nFt_%Me}{0u566%wE5 zcsR(KgtxZuQ@??N@ghy!SmqdMJFnQAX(BYwq%GJ5(k%8uBl$4?EYY5aqdap;wY~A9 z?B@ejUROvwXmTMtP)OjfX*71NkaXznNE>yBw@alxcb9`%P@2j6F$|UmCcl%WRj1*P zg99U^JGwPzlNXP7EYn{k9N!V^3zB17EQ z@MBkbz5cb;Yk2-{LEh*Bhz_xrx!zY zYhaaE4AVP=j~A2P%^U|!y#yKT#QO2f&Bw{>2D~u5zV93-ZOIDF$|1|yfp16{^)1w% zd_$&^<~*Hg)mfN9;H1;xq#(1j42{hzA#SbsRj@A?BTW;&<)vV*Y3mEwjuL1{Xr*H% zq@7cSf58c|_+|bV|119&Cy85o$5lS=C=!c`{l6(^!gv2&&OmnZ zJJPeMqgF04mdIP{?e`5{QRY$#Yt+Tswo>w3A@kXma&nK%W7hNJF`3I!E|9m#TN z>8Fh&v^OEa+B!<_Hlwe(Xe(OKrV?%IqHXk`f5@~Y#CTCy1G%|qjXKbkgf@54Hh9xk zB6W4qzU)L7Q0n5MdHT_{jY)%cAcWpCk$c*V6?dh)tv`jg%k$w>C}FFFY5cb~rITzMV3@PTi{2KEXV68zs!~!^;$~2m_f*K)^de70~YSfSBTx4k{s@aPbjw z4@uHui$s}_54HJ6MOeqLWbb}0o)PCjc>|7#omH|#oBy4-MJ4k!=W=nIMBdi+|0td# zBvxDglL&vbI717*DmEx&5(~a5-V)>Cz>dF*HAGB@Ew^fM6S2n~yd@5!WUSV3M|^H3 z1KIO>aXC%bYw^#;){-&UB&0=HRBMvuw|cgV>e{OiV_n$|LgiHFM}o8ls_P=d1?ZyJ z{RI!UT3eGYnbL3r+vB3^!oB38^EHGUo;Q$_reg!LS*454@=rMT{|Fb_!~D4QjdgI} zHn6W7>-@R3jdg>$y_)DA(oQDL&sA3|(herg(@j?*)3zqX-%drfI?{nDT6yeABPgwCW=K#kE!UkT2>Sw-<9#P9ZxuRM(D16l#}; z>VhfF(Q9tQb%%)22&bMJ%G5$KA`O;Q&^yEI~7nx%g3b9&jY zgQaV6%Bh8Bc&vXP3wa;1P+cTnUPwC0(Snc_Yo=TUsGgmquEypX4r6 zr$+2-fK*2fjkMApQaGhbBi1ZXdXu{>P#VSkK2QpwawF#4TWUv*y;;}ZQXd|k%fmUn z*@oUyBoE)|Ed^7l5%cLIMSxmjAIV0j*oZX^f{ehKLDB^7*FjQO$VmGuNNP$bX~fii zQYd#+KdCd{vAmzuf>5Cm%j^#+!SDM^Q%M8!36VwvOGBjAJlUrq(gI5AnPHGL4S0Ky zG_Xazvyk+{Gj)!|9VQ8jX>~31;I!mCNPS|E6ep58)@LYOf;#s0P^mwu+p~A5B$9`0 z-*BmdJYb0s4BPLxiNn{4tVX+NoA`pMD=VBBQs9QloTMZxL( z#ui3N^ThMN)oZ`Q1|iqkxT(@=a$UPRRT?UjpPAQ8X&kU@rgVs0W<6%XL6=#|ENMRQ zkaLO6oGq-Rj`Gy!W4BC*FhD zGB)#lxL#%K#QV}v@;!5olXd_H;-tCcG;6(3@)3oqdTsbZ2}b@Y_TwTc+*Aq`$+Gab zY1Sk&?DStO)r(?CRXyvnOp0kStqK-9b1dPh5ti8RLM<_T7PQ(XLBgD6(3E0NNZ50^ z)LtA0QBljKa3XeqsC_G>En+8#8j&D1C-&@SCQ9qXu@Ln*Q94LT5!;_6O{B(tnwl)- z5OSCuOo8(|#CoiPi*ZQ1w+hBPyw56IBMl}6tZ^y?3$%Nw5=F@H2SvIdQFYi^7Z0&~=iRq0q4+-dR-0-d`sTajFS^&eNs4)d#bZ zbX;I;Sq}<9K|1CzTO}u!@NA`V`&-LuO0hXx=UbU-k%|Lc?F|#gPr9i5WU~g@ey3rrUvMpO-q<%h@m2ZWqF@7w2v{f2LyN+c8wn?!tfA8HU zeF^jT#O>0zgly4Te=coM$wuwKUTKRA>yMFNNu>&%+*@mYNNTOnpl+;ZF|_;juG)rT zDMU{eu?wf9Q2QWhaa#I~koUFc-%GPaSTN2!Cncy0pnoLAz$L9YCuPF2J*8aQrp|jA z>3bd`=dly#r9H5SpL;=i2Nv;{FG#tRyv;uPQTmy@rOmu3b)az1uwIfDwQe)aS%6OO z3X3W?SX>!lDlywfRk+1y&D%VZwRwhewkw6K?uirxE72ZL|M%|Lr&2fA{n=Bg^Zz>f=ciHtEMQH~q}Htp9hv)T zn7KQT7_VAwzJ+Y)GpQG(OL!(dpy~a!h4oS&iS%Q+M9zf;?{q46r=&02sFQbzvthIZ zOY%@cdb1Q+9wGAU@PjPJ+4l@rWCJ0+*f>>QOnPZ&RJjR(rP+18{64IWwQ-H)+5EnE z)FgYtebJd+Gt2v_pEFzQB0r*?owZesnSIhszCgX5+1t(KZlJccIb=Y+U`I42mDuBol&l?;;^U9mBOW+Cn z+Y<@`>@gMm$zukw{%z&YhVq<}PUzCo)9jgDw`X=X_5AT$kQqes%El&_S$iJ9(mk?=cnR=jDuG$?fE}E;)lB|6gojxF_)1856p_QFg4#l){&S`l1SxLQwA ze`1xLUzsm;A=U4&n*>Rnx&<~^?3ED@6zU2bjRtY z&U1B+BY6k71qT>Y|1uRkhY3e#&yP*+Ah*!-9J=#z*>VU5I-39P@muCJ=mflT+WSoR z{3Kp8o)GH6i|M}5*gfCgxw_}ua{9Bg9b``Fdc(oZWF3-%1pGf!DUHN`vci0c-l3Hsj zDC3FT9Ov0R!``3m`SXIIn#uNqQy(5@HgCC!;~b&$4q^wq<>noDCXPc*1$W^vIFTcc zdOqhz!U^(-{_L)|+#Lr5A4bflqujQIy&1-P^3elh;&kd0o<};2V;0j<4(!d3emUw^ z-ms4_NCVB;eZ1wL;Rk2-G6Ogl`^aTKc9c7};)gp9GZnxUepyR<@vVJiH#c7V%R~7Y z|GV^&K60zk{O~p7oS--_Ui6xg`qcBLf(-lFbuEY6i|+Vs^U52%QIFqJ*Nikk7Han) zzA#wq8AFBPq{>YNKk}HNO!JWkQ&;GrKJpLx8a`tc?W>v0e(of<^5J98RN&~GH4|QT z&gy^q=0EA%_{#2XJpF%nLm0YaedV_H%%OKd=F5HM7Crc}JZLZDF7`nPLpk=&m!D>z 
z7nGsa)Oa;qkeEMDn=kgiQ&GUxN0DZ3$?3>~z_kK0)c0ajUjH2%wIds+5Q|L?xZ%j?sHUty?=qxac6-tkJvETb>%o!3Qf<0$=x+1|S# zz6r!{>mmnJv$OVV7kM7MmvYud^^grZRfYxC%6V{ikM1qEpt3Vt)?4n!J z`oR4{a%PkJ$bQ_kK5`)UL?7A0z0(J7%{phTO<&mu9t@q?h#7FK;CtFN}hzfWWVZQ%wcUOa;ZJf@INm32*<5 zCcaCU{{Xp};g#^10dN~}gdypLj^&f5errF_o;qoO+?Kpxy9UVZ^*m8vdUwd<`~ca@ z5lrU6h5>SySF;QpD7QC0H}a>WT5HlWXp-3jw`d9*4Y?$-sEP05=M6%f zyESP8?D}P(+|da(UhZP1L2`R|aP2-wUPXNuvm=9KPwv%0@HJU;s|*W`J0Ulm*F-3Vs<`UZpVEXE*FVgVTG7KQl2Hlw+*c#3D`#y_;Bk#b8ye%Cfml&2AR+XSF+8N~HxqX73iokkWyw^EN^zLPB^W=Izu@CCZCMa)b>?mYYmUPlVO* zi@k9pOa-^C@jZl<4VU1R`5L2=NUTK&LMv<;Cd*s-o4?&EObaO~t zbL%i-DhO(1Dk$m;?Y1xl9(u}lvb`y?J2g(%O5tG#z6+Vb%2&z$;u#p<;%a${cnCJ$ zTrCHRrLeJewd^ha1RJMTLz-gPs9!Ay!3%@lYveYH1{Y`Lr7SjWjXVzG4z7{c!Apd} zsq%c_V5)ooK8>0A5rhu1A36DKaGITioIK{0Zf9e<{64&G&|76MLrxYStomlzd@}gual!8T(nL;M-H&`^>Ags zWQGm$8^Dwe@_Be4(B)$&Y(Lxfu{_9~x1KI8w(EsE&EUNO%*9E6LQB;>O6KkK;$xTz za@azfe4+(UNT4C}UxPLktO<}xc?8{fF&mo!^8k#2PcmdZp*e@yj*W6FizDL;))jg_ z&G4yjmT3Zffn~Niev&rtEIZ69H^SWqvUs);9u2?Hx_lxpgcr*b+3rmF8+b$U)~7PO zAl<>1Z0~2ox>v3M zZts;tNru*CA1}?O1?NEf!_Q(z_shPNtY^0mz{Ot2mgdSKwBj%;%9VRV@KG+D(pu*8 z6_m1;P5KJ5Sj)D3C4XhO;`qfN3DQpbm)wa~9oAO=OKwU?I@^*bM|-^APcPgz`9NDm zCDqu|n$wK%t=N6j)T9T8S*v`xgQ4CYKQ&2rgoWqB9FxXQQs>m*F0 z@JfG2soa^s`<`>BpxZ9gx}26JLgKU@-^=jB)q8B*8Hj$59Xtc&yvxR&l~aLhXJsqA z37UHj-X$zxKc16U!7HGsa`_Ux^yzUPZYA^B!t+qSxlFns^H)IoFUUQV-H#ir`=YYh ztqU-32E57CAD{@|9a_g9cy-^=22{vroXHf;bVFXIknt?-mYf0etJd~5uVDlmeFx5N ztmaiCSMncua_I~my#SLHJGF>0SQ zRlcRPgON>kQ{3G;{oRmy+wn8h9ADUmhtAa6>SHh1aiikS_PHr8Jn7eNN(ZM65BP5Q zBrM!dt94TXNFy&JyvRB29Et+kRSQV%2ZY^!VswQt%gM|HG?k*$19Nie&;a@$OYyDl?#P@3|q z)}e#au}}CixOv6CTe# z>Y#Yg^aQrIgAzd9jI6SQ(t*!qdT*sI*VkLQ>DmlRfBkXd(}jzIOa-UlcF1qrULBR* zH116n-BD@L%rQ^8S{K2D>gsPQIPJK#uj{CE^t10<*dp;s=Qt_xNvDOO@Wkp*?Fsm- z-6QEFwHo0Vmq*F4JY{}u@W6KfFF9uATj!`|mPg z^$a&fA7v&_eZ@!N>%C7pDU({-D_~0g1wQV9y91`T7pBE^QbJ)AEM|v&l{4Jb&dNJ( z`~VWy?A7cYbu;=Y_60H4Q5))~z-PX2*!C{U+tg&#jQ&a~p^c2}VpnA$wPdl8-4q}0 z2i+8RF6*XD_ca=YU>$s-W8LrwdPrP5`(veb1N?s6kW|-lapSa%mBxEe*_HekWlMKu zi5ZSaX@IxCo-4Y-1^)YQmeUCwtj>%n3DV2#gWPyLTcHx z(eU+t{L+FkN`wfX2#*`9Od|bR?pUR{^PRee)cd@PS(A$Tvx>1wSEuBMI3VUMVCD#A zBupJsBNQ+AFgBTUhwY6}X23VZ&f}CAN~*Q@CMW|%SQsCiq$I!(P(3CqqhWoVGFjxxvm%QC_3@v$Tg(l%Hhs3p*17tyw!udlI8K>EK5p=`Cd>%GTujR^P_>7_T3S|#0 zIZv!m@7Q;xI>Y8^gX>%2-~I&x4u@sR?b!5?5J)+;Hn{28%9 zse^S-=*LPd{2a3XW9YynN4sx>NeDiq=&(_NdocgvSlLAR6YAwm#T7o8{y@9;sX|2f z+W4={%6QsvUmLbXsiekjSa+KzRIA5!xC+}?%64eZtxVdXT!ZDw%^eEdB{ws}PT0Ab zP1>pS2Q~pcHaiZ6rxousAysForpI4E52jXI=uPO?ZB`6UsuQiY{yfk)sSbWG7zU*@ z%2Ikc_2O}7thP}QF(yliAfK^aS;}Nur~Dzr5S-`&%(XRI+DSpeabXo z=00UU@Z&y+w6U2v5NTs4a+HN4&op?yGL(GGQuZtN;0vLJUn*a~7ecH}uF{t-8l*+# zDwGl{oAxjGdzoefw4#40g`y?ZNl5((?lt@;UhmYC)%+vaqz00D!fIO!=Liquka1># za+$1Q?`bf%*B5H8g^H&_615)36hdkCY%TD(QbJ)x^88z+y@9-|wLPmGr|{LvZ{19AnJ<0;30mERKRqIm+WbG1T?CdUO>aTN zj@K66QWQeQvE{d6+(l@?ca$mcF~V$Za}CTT)MFW|y9<9xFNb@nr;eGY%}^xy+F{Dl(8O@5&a;g-FCIq$&-+QS#hj|8sB#xk`_ zlPFk?#<`^>TN2^^!k74S4zor?eO){Ur#geEGa8?U#!Za3L^;9S7i~@Y7S8-4QBM#W z8mny))lr0&eZcPO)V?(6FXkhuov2dFCP?a1!`%<;Z{ytVexUs#sqiz8a?a&r~R%*#5=nyG3(8gPV-SJmM(?lAjYRR__aBkT`V4YnMFDx@Y`_>*H? 
z2W!${M~&X=gmKoSV^G;Jy*kdwA5h+Nf_#(UJigGYg*>k}4eDB%@3Ago^9`z}*4#-A zA@0t%6#<5-Y3&}!_fQ=C=P-Zhv?l!sxxH^xXWAdiele=y0X?b}KGfk?Z56)a6F~mI zJP_cgTwQhBTaZmHf%k!)`f^ipBR**DSxk0T>%<=*uknr4ENCRHzL6S0O#I{vc|myQ z{mtr4gP)U-HpHA#ys)Kh$6rm^AQyFl_$wT`#YOE!{hZiE7d2X}goxgaRWI=|Y)o&g zy3x)~Y*}MyIPgeg^>ymPhG%S4ag>)F*Ns=Mg@d%afQMBQm$^zLY?hR`*!?6;=s zM1KG7*Gx?meIe=bX6pMi{0QsWTpcF%fQS#9t7Dq={~dlF-)&012siCR@RmTZ9vKY# zs++6fqAP6la#NS{Yzo{|3-^kfI?}!-B5|(Pxv(Z03=gH{jx|L87V69swNQJCP!g+cp^h`=T>Jb0Gb6Af|`gcB`!l(B<1w5U_dZ?X6-WZu4 z>IB0a`{_(fg94s-sDYvxb_BFhC-Vol4QUEovwY+QlYWV3f3SV!JKsG;ul< zr+PsbYw4tY>81V&Ul=*Dt={Sz)R@KYd#g)nGbc8$BeWVUolkdE2hyg{O?=cTyfx?g zsG}jQeeI+6(^0dN_QFqHL8!@zt?*ZWrH!1lsa@e)Na~)Y&FHT7A=C--4p4`RFfFj^ z0Cgi2q9ycD;UyHG92|=ymfG+96 zS!)tZzwB5a=&Wx;QBHl;E=}X0Y2m)$f8R6(rYZ8=nlvAFjqaIX%c(D2|J(zDVzSWb1iqP@IVVh;x$Enrd=`8S9KEmVM z)+Aoq{b0DT#EFgWr&`5T5OJ-a`Xv>d*t-7eLNN;>lo0hKd>?f-L|r20L3r2z7=JI= z-2u?kzk%>;1JyUg?_guuKsAb{{EJ;4sD2MGpmz;YUlY$j{MkWjfAWkq3RR1M??crf zSh_I%VD%UBl-(SxPA5;;&>`x6;NcMU0DMNZYp6OJcsNwuMjo?`Vd^yCVHiX{VuOd- z+34UV=OODc+)m1H=wJ`o-Qns;d&$8g)HUKos6y!ob+qFVVxij|FH=E;Bv|7l7|)Jp z9Ib0O%=cnBq?|uWZB71S)={d3XQAbfQs0CZa=TcIG3rtADWteOMokvUAMk_4>*^ix z3dD?hLme^|{@*xBPZz?S9TpI2OW-TW)_=gQsCClQMb@Mye6w-V(>VCP6gI6kJ^a@c z`VDlMK=qktHJOH^)n(yz>eGF)b&l^bQ*3AhWXp^43snpc38)&UE%HbvGHn0d%`>8 zRS)q^*kPQYj)(W*Z%$A{#5f4=o}exeKZK1Ik!p}Q2R0%jRSUcoUmmHBA>t;8D4(d# zfz^NTBz2tq*0OPux=Z{Db_7pW-T5V+JXwtpcSFR#CaeA3`AKzs1TRm$Z}~!{#!VPM zw)(Rrbqy=Oz4IG$j#AHw1(5VwlFsCr(v&w1Yw$;O3=5o54f+9sZCGP3vY#eh+3*`*_p_O;h2OV0|GQ6|Md$9*2_p zPgg(SIUb&_j^HmAlo{$Ck^HP3n5n`CgeAM!$yv}TTz6>VY}JpDpIEn8xW9b+N4<71 zR_$$s@7nLhsZ)&bXNeUtLYxjIm+hfMZ= zt`2}5_dZv<^P`#9PPIL)+Qo+Ngo$i3OWCPL;v<~5H3=T!s&}eyQ}b>uB1?sp-zGM5 z7c`-HH%r^4egVtw9=p{MPUhYA_YAsf+t)1DTi;UKwHs;*Zz!+sQ4@%_HYHpA3Rd8n zX)i2z>|?57pXw(DK)w8PR3EV|Y)r{f+rtWcWsW*gd;`K)b6~uG%v$ePchHdCta!ib zN27PM+Wl$}tjIfmsU9IbeEWdKRe1uGjV-R&Nq954JWIsU^f|3uK&f*P65z(#CwP`ie(jm+fn~WAUD;B_CI35V~d$ ztNunU5=$Y`o^RD{BD_yx!%Cp{UVxa)5_JMRGSrr+VZcpi zK~H)0!{Jjgh=#p<4_>mwE%2Aeu;*4MxV``T!Ro)#XJ)e#->GlWU;bwPrRqK!n$513 zsy+U*I7rm?l-i=n<^L{?jXVXn@z`vZeoFPF@kiL+v@ zP5**9mBW3ZCYuc_hdFIN`>I?$NsGQw>zJ%wcnWP>0Z( zFIfH$YBs!9ol>C=)ltvA+U`s0X=34(Su>>dtnY@v{{f6&cUYx`zz5Oxw~ao0$(DRX z%lKLKaCGT|zd)Dnc?-Uk{Y7m{L~kg@a8-TX4d#E|b9B{(s~Z|_wXm<3%xUgN*!x%2 zR5D$AepS7uBU7}iH`ES__y!!N+*YmdthDjA`lTs`Urs(L=`2>;n7! 
zrb7$Fbg6??KaZJEr?zkP+HGhkodr&VfA$!WdcQjHw-<1eYzHYf)~SB&_?^RQtFWJa z`D1udA%yVf8vlLJbiJ)(raCL51M&<oytMKBzF>sx%~qQT zdDcFGdwU9OG=Hjg5xrq!*i%?AEP;(rpTafbb-wgejcnVBS8R?Y3U2CQ_B&T3EXgA7 z^hVjy#X0`)~mr}5Sv=B{@!+=0bWB_(=>SLTkHz2CG7QwbsPWx-C2`9htxBl!|h_A zw(U9GXlcYgcCk)>Pz4+(D$(_oxTtJJy9D;pHAAa z97!JuFY+Z>e+>S*=!C3?H&DHpLDeq>QdIpG(o<`%*TZXv9<0QmPay$pxRZV~u*pe3 z$uq!S#W&*r&nm+IwGOIyX&-Yn>O}Efz9e>mPA1CHyKT619 zq0ajL@RLEZvwpWkec|srn&>k~hfW3|F(F7OYv5FbJu@F+Zou{*7_aX zN3Hb%O>X^<9D_adt()?<4X+f{^^v`>`5t;N+Sb56@z9T>@j2|ShaP591ADEFK7?-1 zVXjXsWg?q}|v`W3V^hi&%Md(he(R^+MAq#gz~zO8;Ms6A?{hyVGTJ5;8< zeiUtGU_&hWB<}YX{V>|nz?ymK&w|?TUV1PZ z`NvyApv(C~zYm1pLVm1{&VJkgudU72Bt8t|!l1?Y=gXncNUo5`abF66`%>I~w$oex zCT(WW)Q);LBG%uo*V_B&O+*y$)N4IDLGPhW3~ZIJzLTilsb~4VdLJH9>8r1$jScKV zXZ>O7Vqk0i^gmOxfxX>Dzr@*WfK~bsb8na!;#;tgx76mE?5{rw@8=EdY*)R%*a&hW z-Sj)DGqhJXeVW(`A_BVW=ToDB?dq=YN3CD8o89%v;%l&DPJlj=IvLo>0R0xxA0lS; z&|Ab_uwm_??*f0u$iD8O?_;0Qp7qdwCOH~8g)=Qr$Zh?qCfpC%^!bWVM-dh|A8@52bmpBtPjs)u4QpKSC8mNb5 ziUil9kG`Xm{a(Z0zdSh3a?i@HY-%64*06U)AN^6jObza@-R50qgYPli&i}WSb`G zBY=C8^y}&CnQY}`{X3*eyE|Dwo4>3dH$|Tf%j*|Y^xp%gr|O5hRqFpeDAMG#5I5)m zI{3|CKl7cYe}h!AglYO1@hp_{`!s!`*&QBS;Wk^oC=6eGEQ;2TG4tR1Osn*;?B_RM zczb@nOfnvK>E6zL`q6XS%Q(hvJ)bkB{9)-Ppz zcM{Jr`c+K$PU*xL>_B61e>zHk37Zis$Lgmt<PCrL;l`I|yeRP$aAE)=%hOX6oPM_3{9vA(wHgw z07VM7Pto_lR>q4d`g~kqx&665RFg}ZeW70ryPq42c#^?vLZcoakNiq5j%J!7w zG(Fu*lRlcJU#1a)pP;WhVx<4y1tQ(E)Aegv5thgq`cZ5z1k9a*ie^UuRezR#99tWf z)LD?=nXqKf(jQ_yVVN^q-wfw^cFfjeC;TM2Fb7KR+<5859DR4K<|yg7K);JT3 zpbv70nT7s##f(`}yEuIkwmhVqczv)IS5x>T{Rjs0pGnDj9A0@nicC+@XEEzXlU7ze zE*6g_Bdz)j91szfpj+83?O3AsvuBbcq(4{cSFku-F?)@Er51-P#4q(D*d%0q>X-Tf zEb~n$$=#%1&NzjV;hRxgv-(J-oAupT%^IoWH~Qlanp8>Nsh`F&uRD_Id!Rv2c9b&r z=*MZXN?%h#789kfC4F;y&3x(lF+H}|>A?b2(-xGw?+N`FX3I`;_yjt>xkNqzJ>j-X z>X@zHpj-AKpug*ImsZc-)Ll~4MSXh>`x-_2_LAP4Vaj)rYghC=7{5$X z???R|W?v@J{-nRp+{q-5f6}*N9Cnj>dHO&m073Kg&KlPFDWbfpAHqcKBDb%C88b4; zv}^jowX=5N;L@i2_%|<=!478CNoV@!AA)oKDg9Wf;F=z1*T#@OH}pZQ&(m^g%?)TO zXNPFaAsmxeaMTRd3epP4Xp(X$M8 zyQO`9>KTn@Ad&L*iS@8CBIkQXW3ej4>>c-3lib%a*}vUkIt++$uzrW#6EfnVeijq* z75Vj{zNKR;N;Ti=3Bv-nf4^aq0~-rgeX2M7&c?v6gTWBM zxb_4+9VJ6uI40YxN@FC*R&s{fN<*&78;o}B44~>(HCPyLy|lTi!ByL8F#=kX6Ywu| zJx7Ne>QHnUMFgKrUUdWO3v^&fql7-f@NPX4edcP&(J}S((!N@TJq+UJ;^%Q5Wq6uZz7k58ZNPgNNLkBLl54Cp4-FBHj%-gB@P3643;0lQ!~NPlhs_3 zdQCP2*|Xly@Z8UI!zc&#CH!QIAzjNl0rzFR!NMxj0FOk&YJ29%Ch71}gI2?oY?4l- z8vfuFzi+-Ya2og0O%Rv4o{8NWpdKcp&~(5K&6;L>wMn|T!O&ITX_Z5jgcC_uh4_uA zKH)$=bQN?V4CfAFYR!AM7B=s{tc9tyQe_Vb+h&;A^gjvXvxmIiX6W)Cg-P8bMQ=CU zW9SNl)PILzoE)cV=bo5~}Z__Lue+Y`y1{Mj(iY1T7L8D_j& z$V=TzvVS(Xka0H+RoUUF!+AFiu`Kon-~4Kju`&4OwxJI;?R))ZXy>>ahg`7aHq|Ze z_v+SU|L^bsilG?`EqwNO!wu{QN*nJPVl;SXqS^yPA0y5djVdy1W7f=)9G@H7vTQ11 z4|!n-)#7Z>`Z7b1LE|soXN^hpg7PGL;}koL*6;GhmMqiML2?m|Iy=VGL8|3ybk;HT z9i*1-#>Z}q!9nt9WenjoZ>0&Hj9;->_7%Gt`~Tgckm#<)1oSy^T0kR45JUW9-Oko=WTc8CiSg@@i@G zr$#4wT6g~-;|AqctZ_P;&V|9oAUmb0yN)#0R2FDRt4QO=K55`B73BObnCwGx&kFHT zm^foJ~Dru+S8OLGd+f@*dR4#z^COb_>`NFv?iMVC2v$%D7DtmbW+2{FPNitS&KrQtcuNXkGCW{;{qAHb8;p z?I$V3xQV@qvihtxHf93%Nkdi}S8ABZeNxF9V`qlli-_8$8@=sV95|4oHXF@avZwl5su5v}-G^K5VSUF!_6>U0Fsm%S7*&t{yio(jygV==a9CjDy?0H^_L_UUKgT z<0p*I0pg!)9LnTRmeO;L7jX1*zcl`$@fgEwtw$PPFUcO?4Hm*}g+M8$Wtff^Z zmEJH$C@ho)-ZakDQl?$MZNx$`H(UDsH%M5G%SfnVr1{Y9kW43>;L=qTN^{rPn`Imh zNPX`an`oHl9GQ3DI96E_eRALEr^tj-GaVlooi+6%4j`uINm?^>W@P^By&9@0p+<#- zR=erS|&?a=9;$bso_13X7Rz%cIMiCm};d88eDw{RGWR+l0+1S&bTy*8UXp`Fd+LhB( zoU~I<{3|E(?3Cl!Qq}5Qik7KoPd3%!zGHomjXt%xTFl17WMXZuK32Py*XE*|dDzoy z_%Z0-AXytBSz#DS#+tn~R;nVX($gR5EC{-^8)<+$61nHdm9Cn^-+LH%nZXZiWrRy7ThgqtE1~ufu*-4<; 
z-;i@-nBXJik|!6<4n%G{G~(tnF-J&tBd#UmVozQ+;(FIh0h_HWEch1!uO!eLg`qLc zkV(2o=WSg<*ZD_#ao)=E{z@EAWt{Cv@5WpnyBBFR@a7(9*>uEhYR)a^Rv&pIzwB&} zeZH(CB%?Xki@A4%yl&3DV1EHgaSN`LJps$JmfT8qErKuf<<@K1?_q1$nybyw9Xu(> zkHc|@+${2>E!T!|%pwiiq0pel3Ig{LiuV=jmp4)@1gBPK12?D?2m z#<2EMSAXuI7JYNGPTV9D>rhlqHud3h81JLfm{6`R%N8K6Rx>s^|ELTI>Dj$p7bSITd&?_4%QERyjwiWQM>$nAMFj7lj$K{x?aW!H)*GR*D1uyG&;OkafSn_2C z*GJLNg&Evi_G~eB7IttA8UHDy`c7n6c#c)!oe;ED@ENlc;{GKpG*zyk@D(KWtKxF` zu!a<7a@F)4BFcl$Xd>~*1&va_LUAU+>2Vw>G~EsIWt3d2=AgUcpAP?2 znu8g;p<&QIr7w4LJ2lL_qonsOP{!90Ab=a?A*}a!D>%XIM(A_hI{rYbn(^Pjq z$NXR~=gex4(F~4|I2UeGKJ~HXaHxbr_fRk=K|2^ujrJg=RSL7@AaX~`R#BLT4{U*z z!bmp`ax$bCc~JPd?H=`|NmbCtD;Szv6uust3kxka*G|D-p_I0AS5q)~AV9h6+1;jI zb_z?lAO4`xVv-~uwpW?-4aFqqgIGSod9_J~+@WU=j4@b~(`{DRPnd7CR5rNn4pt}XK~q@CX|MT#tA8CO<`dn6tBO7HF^5q_?Hx;wPgywLTO401S`9UV(kLxk z!NF!1xCv}kH0KQ$xZw=j6*lHFw?M--hHcRm&RfI!z;@tA?l&zxtVFJJvseRI;(UYa z#H4PP2HxNXF(MTiA^w}Ye~XKB@+LQtW%g|*EpBt6>`0`SaGPtz$eT&lZ7zm;$M#Hx z?JoMY^=PMLV}2`=`Wt8WehqYFX;s|)jjP3iS44A%`)m-AEf;+VToAzhZXJ2_h^u4DSP!9$ZI3H{S+jAq&-F1EpoARsnCs6bm6S`FkGZXE z&D0YZg?Q7Acy0cnsjuZ^)5DP%1X<(!Pms?GxMs|)mXwn)2R?~)gWnwo{#y+z0Cn8JyOXU3z8dQQJX|$!i=_k#VVQw)dGVX3)0kCyZqlzW<`2O@{T=)8XU0Oymp+1jin2$**_4%VZM|1_YvSQTtIWq8RBfc-L zNgpQ}UVPL4;!PfT@r@kEBI7E9)Ig+uV}7ABtjcK2r>OHg<;-*^Z@z(h?lG{F^71I% zjkm{QygzkXD3%vP-JRRFTOotw|#?3oMsx{#ou?OMTsR{qFo{j;P(H}`@!oO7( z5EnM($1*<0Nl{Z?VlRU5yJq}k9nsjKLE?{- zFMRmxOvG^#(wtxLZYC;SZ_YbmN^+b8G~ufqXvz0r{PQb`!I@^S<%b!!X%2DpMy4lc z>24$~2M%TAyK-e%JP;CIsl#t7m96-eY!Bous}=u>%~wlPja+QaA7^}yk)>^5+S^H? zetd0~?T!FbKIVI}{b4!&F(0msOltV^#frtTJs-$0cDTvTucHX5q&@#B>c~#&(~)n( z;4t2zPP~_M^b6EVpgAJBYZdfNI7T)Lv7PS3@6x;>Q#K};b`GZ%1LtMQ$Er}7hjW{ z6CW0oVxiy#$?l}DN&*J)OR$K0X%K%;!{nbN`y=><^oBFJ9l;N0oKKN}A$)}9cWM0) z{yxTPr%3!T{$ml%n~pCcAjo0XBa^_7%ft9MjVlftpr^wbS?U{w#fT8)xF8uff^W{w z1KF1&c)UJ`jF9}%{0>EaR*&I*l)26iV<42MD58nt8#5bEk)~1nAjX_S7Dw?GCgc<` zj^*39#=N9SOvnGbBw|tIPsc(+m%y@hEbpVIqZ#Xwv2fl2o+&*=nvCN+;R4U3ar{Rb z+D@f43MVh)am+B-FrwI@w4Egm$aQc#s zuQiT4O@>e7eax6B;>jAioYAke6pbmy8|o}DnZZ9jzlmM)lrv=CG_=ZdXUNlOe1vkC zGGsbGQ&nVfl=~N_L;L6CkVn%Y^Ym!t&>6fh-p5@r1Nmp4pv+_@q$}_&`E@3Lg(~mN zS-fX$rG*9fAY-&^a>R;1He6uCZAIGXUe4m{GWlnv>a+PNIR18)q{Q;AtIs?8t}pA2 zA^XEC$}zcvXUQ+Id@F4EiF5d|to>iu3Y^1Rl*)<`lClx4Pxf70_M!Q8oXa<5!q1U_ zx%?+=9gwb>3!O%NR#|xOS~On|yMeLFInx0Hlg{E~=l5a7SS+Z1Li)|Z-)0msJh6aZ z#J2mZd~1(5Uc-hXo?h`_{SoPEJl{xzhwtPB9`FB2(#k}BfQB6pf~QIRFkDpcwU`f8 zMz9+f^P}0=zseXKPV~3-8d%&rtsCVEQD~wt!P@ghozHNeu$%`FS5zv3QLP>i?X}P$SJW(aTcS74$~b-wNd?kR4mje~EkRLssxBmGCcC@IhFb zZ=c2=U>!?QX=!{8+ZvXmE1?LrrRCE4ReTF5l&_>Kgx{)WV)hz-DWmjvVM zYxy&5Pgu&=^6ePU?}^_!zO!ZvnZ1s`irZm*)AY zX=KL+{v~svFZp^SzmIY4O9pS^1Jt?ZCLV`AX!mqzGyj`L+2~%k1u9{k^y?Pho?+)9 zvOm9pDoQ{xYi#9z)UZooEBF@OJ>4IdYG&|w$8EXPYzInX8B;9h+ozPu)`j$XNm4h* zEYZnv z=M>B2gqX%?KtVGx>3f09?xC?>Mig{0?%*6&Y;8c0(6K zpLIC38=|!t3}N^1Gc-)d_himqNMkaowvYcyvzYMv`4bGT7G2uUpVuUjwFjUelgRl4 z{CsS5he-TT28UI@JH$`Ih5DL@`72uNdp|wK|Hk0K^$Xd&J*$~buIBJlo6koJ2+}@2 zt1j3j4-827F{YC>?#WygzsrUX3V?bO@xhE(GW#^fgtN$n(|k)!yren9*LT1T`34u! 
z2A7cj7okUzNzz5WmQW(Tt+)}97RNb}{TKP=$a?T4{*QhO5j`48)*>aDhoKNKyrL0T zZo{W1sdy@QmGMs!RlJ9SR~8~0YpHITJ&DY@48@dkgzUJ?|HODZkp)-y$m(f0Yl6kn z74#6OS`@faYGpVP`3gFn3@544j~Ehb#*;aD{3Islp!6&cqZ7?o()b$R7_TS{yvD~g zUyf8mwaF`@Fa=Bsb$Gmk9woG{D6har+wdz@d~6yizs7rLqNG~a`QKPgr1a({zm&cY zY`xF-VKHC%HJ`6zAc(ayM!vbbB)f+^UO`%GL-?;D)gJTd*bm?Hn6IJLgi03*_+I#O z6RBB*Onpm&iuf*g?mfAPe@jnjE-prv_mH#2;C)Z>rWh5oS885@c8{xe@(X^QUK2pp z$$aDXM_?5qr^lNj`Orx8pk(`o5@qv<+}hZi>qrjPOPqE5Q)XzHe^ z*+fTA5Cvec;!BT}VShg`IoceI<=(2Kqmv1b1v!(kPNsQG6CHW%WC~!sb)-pE6Fw5- zB(1D!3e+;5I`WIFsWGFiN*vrw%bAaKB;C#Q$!G1Wy;b^nL`7{_WGl>H_!27ZlSmwa z|J$?s-=1R?&v0v89hZubCTdechA5sKy_1fJfyE9lWUvW`aMKxyJ{cT zx1G06_)$m*ru!0cDJ6JQ%Y!a~AbW2(Kx^@4Rbhoz2OXGy^p&Tg= zc=}4JO3eA#?i`p6oVZQ%fhk z^*8OriCvwvvm-1R-w)|*>d2J8B42hkxia!AB6T*+gj;IS#njn=*M_I|GA+WjmN&gj zEgiM5QGfI+k5y>>x$8(kZ{SE z#{NL4shq_<@HhQUTlJ2wmG+$LNOpvoj^p6jgaIaevEWb18g9a?w)aW9L8fLf#tniD z+$XySnL0CtE_g-Bgp)Ve#~W;F$6(mIW(cV8F!A?8O`GV`-{XgynrLt&Y|RMMXBwq;K4eF`2b%nnt@<9bTRQ3hI6kdoM~2A&*cjRLC2X^zkM)jhD24ya_i~f0B+yn?`DIfYxK8sXjJ(0wdld(Ak06qT2YJE8Zh(8aN+;=$L{pg^hBLn|H4SINT&0ewrlUH| zHu7ersSS0vUS*o5!+2rGI#V<1zP--Wm_;9N*kI}lBVq%pU?^D(my^4zs?n9PU2;mC zuPb@F!L$NTmCWC0s)y^xTQ-6{;pELmlaHo9X}Srn0c6o8liBjAlJkf5Hd()AbN_`o zvZCO-EB)=^G)Da)2kS^?` zc&oI2k7KIV;Gx5YoWhVj0E#|=I9@b`VEi%U zBC2DtwCtj(rcpCm+VQgqw=qUY*4w5|JUY@#k4@`w%|e>+)U*Td@spY_OpROM(yKK& znjm5n%9g>DFX0R}qq{#F5T&xZd=JeUbV=@aWX20Y>QsBzYk6vY zr9~u7Hzx%}J#IGt)iZ>g8erWUy@SNOGwV|s2<6MLb+UgJY*3Bo|?o+b#-a9>6%jjk#* z#7q5Jvcy^N#{BJwv(UhB64~uW#}y}~N6rGHG5iDsecL(qd-AfH(4tumCEy;OL0d$V zIHS7gx0I3xMkM?=)d;Qm0}aAG-ifNeL8*-DX`q#c)(~*A+^m(B)D*tdF}1Z))4BqVN7U3xbIn3SmT}XP zZ$1*{GOk+E&{Oz=n(I7;nT(5;xHQ7|U0@z)B93t@OefTT4Ry483}3!}x}zVh)g-1bn1v)r@SQb}XMgJm3%LNmdK zdHh-mZYJPhk`dYU6>8(H6)lT%Xs^mTn`ZX zW5uv>Ct)>n?jY6&g&?QPc%lQ3zbEIZ=DOU2WO5gwI`wbcS-3_`v5U~_zoCVcbH8-Oc9Q!H%h+4496RlX#O}n?G7%ai0cvA~r(~0n+77t)Nsl|+$ zRU_gtP3X>WdNO{R;K?}a@j(+I2G21XrVGtrv{Q|7iXp9?E_AlTn~1;85r#3E<5K&1 zAi}pWC(IXyLqlAgFSy(L)71w508%txXwF2t5%U6}K+~Q$#|e}0gnE3Ouo_R&)QT4x zHSB~;EXt$09vUxDRc{&Xv4kCfZE32KF?7mmFGa@-WAQl~H|hF9VI{skAWcXVWFwX; zCZq}8Vi4$q+!Ym$5lURjAhJD8=vuWm%|%Ktbsf~H%9rVK8q4*5n{ zz$Ek`cfS!@Grjsq)wc?FGXwpGnaKH4a+(@TNQ2$N1{kMz3zzU(&B{Hf))b=K zivTGkVXxwPL5*ZGYM}IA-gbI&?FZoo?uqWo6)Y_7gEqS? 
zyuy=vURO}*^yNKaEKb?}c18Gw)r=!Yt_pK8e{Xk9F%Df55^&VE<8>jK!N=EfuM3SB zeEZ_B>p}yDJ0#VfI|(Int!A&2_g-BkUX-xNNAKN)lr4Pca1cvI-7e8?~Gme8PD zma4T*3ZD3G0_5G3e)g0!yCKvkn{EjMF)A#(g^vqs=;2$Tx$?Dw=D!Jn7!6MQO>kxL zBF~cF1ur%N^>pNSRBZ$q{D;uWf-^HEv=fduyU`Y!ZZ1cY%zsr_i1m#g5iC6+xX)aL zdS#il)W|T?ek7)4a*7fnIm7G-w>92;i?(IkjTpK-eY>C#%hNLqaarbkcxhtH$!hG$ zYJ|x&LXvskAX<<*4UOs46{mAi6NC6z?@YL*y#2bJ@&4p34X37xCB|E zR0uMx%Kfan7Or+@`YX5?YGnc)+yX{2E)aexTjj&B*;Z7*>_pwps;+3+ic&-d2t%z6FW~k9+C>R~lS+{zDCm zeNI_MsDrjp7%+s>w>Dr#P?T_>6P}R~v+p=+E+?lcWXPQ*5nf5B;mKsbsRKk7+?q5q zMWYfF-wbmKb)_hNVv6GW>%n-{@}_Z$LZvy6Y(^cUFZ$TR*$50!Nzs2jQ2ae$HzT5r zphA+mv++9fYS_u0dxDDv8P1mLB&C_tfnxV5KG@obsG6l^HGyUi5oti;qQqEpbR6j9IK&~-h>T(8a3O&n_G|vqmJ1xPcEgfT zG<2vX3bA;A&MahH)`H7)2PT}x3>MBnma1ew(t^SBs1-k?8v*KY`pCTu*J=6@H+nlU z0>hLp6!l zD=G@)Owvhsptj>_G-~oy(-fDmnN`W%4-86C8S{z|?(+UansSC8h za|o$LN$HMV^f4`5Li|T)xuf5)0hzS`Z(~JNDrEy@$9t?$}62v`>X(g zGHC=$JRoV53x9J3Jir$iI@MDO7HAHEhYyQbzz=6%BJmmyLjxwp(l9?m}3p`q5-VxT2KH;Qv*= zqHK)?Jo5vsji~|D3Z_PbBT@5z|M+hK`qHkYwg+D|+F z{2SDyU!!cuo*$U(|JMUWZ7CWm)P@WjA7b`5z=x_(Rd=Cyh>KQ>nYCrJlQvltcAXJ+ zU8g!pW79~UAn`^#o~S)0utK1f$f)eSrtC9v9rz1@x5jB1pmf*xr3>5{O!Jt*o##Dn@O`B{P zp#87ajVwg6sI@pe^rV@Z4mB|kX;L*23N;aitVC60M2LS%v!m9aEvx8(Z1N4h`_UT7 zzHf&(qu>mn0TH4~(hDX378)mxD%&Q>l~fCgQ&AY?KNN|ECerO}g?;Z8kDXctr2|(3 zKonecj+Vs}N>yWZ+T<`E<7d8#T0(e6HabHqv650m&*Fo93v(zfFNE4hNT3&*P@s9p zB#J>rrJ~>{LpaLd1Bq0sBu$Ys)Hs#KiYl@7*H-7tibzp;LYh8t`C2Ma0u@x~ja-C4 zdz)dgdY^=MO!ghBG(tfgpsr|!Kq$hfl0A>lU(leQV3Rj8w2`KTT22%xGb%9x$k4>T zOwwr-6mKP&{gjIKsX7CExS&neo~z7;3Km$i+F2mC z8Z>lTatAn~wD=3xBC#2GT`Wdex(6Hh;0!HU65X@CgW=DYbmu}@|4=r_#y_9~$Rm9* zjoD&0V^oq%?4JtZPWXtHZ90QuB5d2cWXe;)W#HAmP|oq@5=4pl>sJaIqDqxlb_Uf03w{3gE_;%tM$UMqdIx=R22p+L<)wX>HpQOw7qJd`I0 zwDWEu)_1qj{iS`dx$4`uPr{C znGQ3j0#_-Wdp4Z&hl%e4(W)}RMVeJA1Z}l+E0&u9g1f3_g`K+pf1x6-KmSEs6*$X> ziIm%jV{DoF(Ep#@IsYNOEgsANcPT3}6!oFaX8> z=LJFi|GR;59=)SYuVk+XPfJ;0|F=!cf586A8K4RH+7dLw)N&>A0^`32TR=yF(pxG5 zng5Ll{)xsPVQB^^WL3%prKf&0F>0?cX_}N4IE&&zJt>kBQ>e%k4Uh+*Zfa7x3fc;M zWm@W2QL+j=HJ{kxaJR)xX;hr${lqE<`d4;ZI&9r1msGv<=7E%s`m?Y<5ap zLH~V&61g03uwjXO{-YAv%i@VY8-&860*tUfX$oS1d$STb5D){{2Z#iYLaq%c9R;s$ zB{J6o_UG62*sWe;R!59)uzkrH{< zQ6NsdgBq}Zeq4nfFwLhBakdJ5;rBi0Cj%&shIP(CVgL_Zp48e*YX4VIj<_hkL@`nS z)Ugka^c%=!-aE6xPVxT}P}=_$SpK&G|Bu31KFoCs8y7yrQ}>+fYAJ90q?%dRApNB! 
za`68i(EtDM1OJz>_iS<9suX7h?!$CcJnFx~qk=HEQs4@|e=`3mOkSlpDri28L&>D2 z5+P;frq?C%(6SPFug#?P?Un4*T>*36yt7l>{BqUpWiu=2{yWoB*GAm=gG4GKqk#<{ z_*F!h`0D)(G=$r|!%I035C(__JZ}F|&guA4z6a0-yp-Jm-hgO8e85YKoCixefD&>D z&}7`Csh>|A5lyE6DR*DWxd7(&J1F_UUD@xyvpfI(E{=C}mVbr(pK)9MMP%AP{^xM9 zprLRJuqsz+P?N?=hOhsKu5#_^@{q$#Cx z#0;<%0VynQI+qEx;Hb2iKkfHMGgWU0Iya0Q_MtW@p| zXck#2zX2>61=f}JHl)Q!~muwmdTpcWwK#WnS60Vnf&v@GI?)8 znVb_}CN~C9+>QlhvIrPAzf87_n_DK&n^h)%J+n;qno%ZSm{um=N-2}KCzr{07pnm_ zC6&o>q)aYbSti#-0!fHG6}UQp315`S?Ex((m&t!lDwEd$x&azbER#3pmdS~LgMe4( z%jC;2Z*6}Ek-23SIR?lII0GUI%K;UHGhiPK=wQRi-=Anq$B{C*3ZVMIGWk2i*$MHmfK$js1ve^8%0UZ_=newbL2jQR;9(>}aTFp@xAq8V zMW(2mRzqFT{<&L)gD@8Y!T|LEg_-Z_zCs2*G$qhX+EylSg-J5e{ujYQ9bedB0L+RE zP*V^0Z@Zwu05q}{Fe!c(%n^X|JImx?z>^(iat0tA@H(SRehi?6`2}V_G&Og`a}OM@ zh*M1i_5T%!PC_V(0LL$t$xe`kju4I_NU2H$Hszs7L1HJ_q@Cj1*_z=bm=#b#Utw-W z!W7;h<6BT&@xxxp&ck0RCbgS8!Qo30Bi|vZN1DyO59X~(e&sg1?`JA~&M@iPe#h`B z+L6&CJfz*#anX>Rs4j+S{>jnTpAM|YE_o$um%ftk!K9FDgBdW*z()eo0Xcvt02+q= z$ncNE?50-{2;hAUCLrS{5a%Hf-W7-ixZA)NX2^|KQXfz88atxqYuOV}4tVkpkXu9b z+dS+wnDSaS+dvDG!fxQxVFr#+?N4ktXSgGVOAEcl zW`lvO^QBWYGTBFTaEYE-AahE2s#NiYNjGdGXObm8V$ioh+a&18fiyO_myOH z3*fbcENLOuc5x;J@+XLw{%HC+xZXCLkdrNt{$==8RebO$3QPy=yW9fq4`OXCx{JTIxQSPR_R!8`g7NUm&-lada{{#4&wL9-o;UI+?Vm zJb`k?j;qgl;^d2bZCOt`_=>G{AsY(hG%{s@Se2ysiXP0qEo7&!*nx5WhCK5XJL>|r z70BUaN*9#BPb;ymZg@t4`~`W~T&zh_TZs<3gK(FU5lcmPa;TM9hnca9+-xO!>7p|W z0Bf@&%DcCWw}`6+dHQHRNSm)JZuW z9B&8NxKWJ^YKOA>xR3?yfWPBH_Oui0>D*ll<+G%3BzRfUPV{5a+(^@p!JEYDWDqqY zYLJ~Di?t1A)Lkad?QK!q4Rudy6v_vQW;rrZ{IS@V_4gt%lp8mNU;^^ky!5(A?A}9_>NwS+`K0Ks57E=>$==?s<`89Yn95=NcFO-Mkz@Q$>J? z1E<*Gq~V-_I{{ov>R%zIBUCY^{=V?P(^x4|_2yx46oR-GCB~0N6a`#)rG#i(i3P3{ zxa(w7vQtg6x+7$Hbu)6fBicw_Gjh8lGU@C?-gZRcN_~h|fY?x%+PqNqAlU)nww z7Zccm919R9GBGVl(@tmuC9TNBPGUD5H=t0iN~UNKZ(b+ST~`P7)!M-8w^-BR(sXAE`C4qO({_=NMTi|E{_(!Oi4El4qT1ent^f7s!M83(|p_fm6xE zF5)LTZbqRzmCRp;eErb{k{TRaD5sFR(QtnhC^lfy=91olVh7i-`Gqoyt^6qCwm4U* z5<0(-T$(IaGbqXhh>i2f$w0BG%e-aph^aDgz{d_^ml3|Ef( zRFO;VCb}`D2gvqrNQXOID34SNQrry%^UErfmyk}MA>HOdVC3qHq(_k0lnuXKNM;9# zP1vY{Lh>CfuEFJnGOcF%3j-9o;*e%}At?wF2e969Aqff=8|mh0iXZ@e$D=g{=$z0R zgT*>J4_%QQs&eyEu-KMy(-T8?aQlXyw4>%~k&NpOX$YuVB+ntc+k?xyx+8P3)r#c# zq;3Lwj%VG)8q5thV(0<7jWvqob!3?jWmXR`%dt+8yqAoa=HyCZdq9t*!~FxPn+v^? 
zN!@-PMRJU)Vej++)8;iOk|mW1b$TK{`5%!%J;B_AEsErmWJ*7@jiOq(yK`BMe1Hh#)kGr#vFQ+S-p|5;l0Ubn7VVli%&}fXl7CVQ5!e@nh#5nM_C>hM zW61Kp;O@LBMRI^j|1+ge{h~lWM{9Rhkz7}er(S=tw$5jEk=&B%3U@N1zgVZ9Pi&Fgj!JEX zqKi@$9o6Yu>Z85JlFj|ahOFD1A|kJctljAkwd^;iNZz21I_$&H&8&_qLSLk?IEcD$ z#M9C%xnB^5T!$_so5P^lN*0ofVPZes@T4O7oGSAj2cSxxSn)gy{8N{ZTmmO{DfG=)L($sz{Em zN-hq8w0nLk`s=dz=kf)bWHmDPQ)DsR^jyBLy7$1%I1A+Br&PK{@|K!uRY}u9FvGma z&_Ss1d0x-uRZNNei-bFyiy|CAX;=(~6iPAh1!bpW`DF6~hTn#Q)CmN6S(M0B46l97;o;L*-x~;ZF2J!S=-7 z&*c-Uf6!3)JA^%lMo=2rqM;CnuyC?-C|YRZpyz6fbS2Ki&}ae&lcvK^mW&Z(;V=wH zy+400ceUju3TAFIbiTl$K`fxHeVEuw=RW;8287BmDs(vFh{6?w;bIN1Sm0f3VQHtG z4z)EGxXOd?OyJ_DlbyrCr}#PKHZ=p|h-m~&b0X<50wQ@i`8ftVyXio3%?Q!8!SLnJ zEJNR!^MfA}Kh?ra2b9tfat~V0(AG4Bl7>W4P zE|Q5OA^PDzlC>~(5kEbbok`!1AxD3V1ke2Qp3ApM-8-mf_s<}4_pUxC9X~^#6qf&- zO!`c$#iqfs6d1-?Lymn$qhiUU&!7-}@O;8EwOeeC!5wo?OL|8_cWtpFOCr%dUbZWi zr<3eaVBw`mu^Qu!XH$V^qU?!b6k79Yd*TI8Cd;058wGs6JsC$$vjbT&in7Ro92*7Q zmg+#Bk3za(dQy9|*saAAeX-n;vIe?lAu|VTQO8U1tzjC9<#6hUp9f$ya1r=G&S=DY z&_E7T(;1)Nq-Fq~!5)KhtTqPBtyp_y+n>I9s^PL;K|7`C{F}mjK;Yv z2CUlHTC^u@6y#!~saSR+Bm5wwk(zlsJy$AyHSwU0N-L=}ogdUq+tY(?q%3df;wmj)Kgi>RKhMZCr%oxq-W zIvkng`V?c;Q;Bc=c!ZA*BL~KV^C5%D1Nbw}LrLvuu``o7oQ#YH{_u{u&kI+Ia3scVbn-lT3T z^h>iQi~X3GbaHVrBs3?TJcH?)mySxNB849vftovE=$>9oHtZ9t4-SN*5CnaxZ=i|m zLt87A%2O(FB>a8W|7(A$G2-Dr8UFQXc>HJrtAR^duO>h;r-)i6eLXoa1?~J6DCeu~ zw|I((bAua-$h_VHkqjKW;INB_WB3NL;ByF5{6@0#a};LFMskt5&uyY^7vlJZ z=)mT0Lhu%1^&0+NFtONFETgYhel!=Mz&URwy}v+c-_2y=7YJRtnXLUnZ0L33t75s1 zjh2?;2557Eqa8l|c>taOm;V)c^aVuDcMCDafR%w;isibL06&^WU?6h31tskuR=10U zWAYXJ;gnSmQ#2}h?Ve!xWlm#B7sA=5-RuPnG5(unY*am9k>MkSmFaH+tRd`p&06YIG+ zZo~O9;OLL8-JFA;`!;fB8nna4ZG@SQv7Ezp;x!$dE8k8&nGR8L|E^emVw;~$N9)Rk z`#HH1NA=fXsGy%HG9|MZ1IA5n(d4Sl5PLGi_mPn^#Cl#C`-!PcFsVee*4MI8Bh%e_Y=cRFfxBX@tR3h%>mMTCNlZ$0kRl)mvaZ+F_y~8iSCHw z068`j1&Ef&0~$6Rm+599&d@`o#Vjc3gNKUc)~foNHA`$=Z{zV|c{swp>l^zk&5h1vD)KAj7N6$Ru(wKqwfE1Grba0jbPEO~S35=$;ISx}g~T+)=9 z+6$z&1#_H?>r~Mxqp&Os2BNvwi^+8hmY?q3C?>K6lgyNx#bn?-VDezGz@m%&rC8oc zM+h~@sd-{sHt!Zl=V4%A{0yb6^=ZU+kpwxl4O|MVPYZi*?w^_lwCN^ASh> z{bKp8Ix6&7AbPUjJ}4$V7l?K7s$DUev;aYehdJ=MV%k+Gv%js>!@0beT#7@O63nZFc$)7jCGsWO3A&Iz@ff~n88SB> zo!RB;xBt<-jxH zb;#ia)KG36(qW-k&p^e^8H=0&9u(iuE&%p@9@Ni^@vKY6EfgCLom{s>{+Nn}I*bd9 zW5+3?O56NC(5J$`LKjioIp9)0z}*5a5jgY-%8xdiLf~@hl4nRm7g4W7mML?qk*15p z7Osx<-_cV2He+CwDk1eth$R;kDT_o0uLw|#QX^BePX;Is0#~77sMb0GTuyy*Y!Mo@ zubDhs1P-K_iAy5P_y(jyqBya-bB7X{J^@F6bYPl;p_d163)S_6n~7-fGdh*Xc;s36 zdEnpiQ`+SbQ<7NE^)mb_cYDF`d(x?dc4%1TOTs#~d*>4QBQ>znUBrUIuQQpI1fh=V zOx9BK+s@>0lGxB?a+i|#dVx~VMt31EDNSq_;<6Yy&+9@uEQb3YK9NaHZ6KMuSghAM zG_XYejHW~DDUKbwkeQ*>6XjpdAPl$v!Jo>&J@^*}lHV7j-pyT!DH$WMd0k7eo~{hD z`y`8Pbq+Xh7(m3fDA(p>Og(P&Ag_{MdY6H>zp|MY>Rn-%WBa5535E;hV`E^buAVn?PGtDpD6$!)9Hjfwq~G+YAu zje|%(YUT_gbC)1o#9;CbHA@DQi!f`aN8kt|Ex$6R9*MGJX8}%U%t10Eh|^Mp4I4r_ zECtVEhLE94G5bC-tfbPcms8rq9nj|uBey}XOB`NONek9wvWAmxslZnmQBtvxvJ?wF zy3CO!@@gU;#Z)vm6{A)6&q}aDq^t|F%dm2kHimRqhIz%lv1BeaV;^ z(s|A(kr{Q`H+{L-z_ny{$=@RcT7UaS;xD#@^!Y-pW~7KbAUtdc3vG2JKm^2=$atk) z`B9t$#32N@%00e2aKXS;j5R2o4{(9N>D8c0dkO@O##e!(iG~B`51b;+6{{ozOk~Uo zk!M0;$+Q(qF`2(CEaJj%$Rx2UE zo%?`pIB+>1;9`I~@c}LoxGWWit_?q$a5|6&KOoEmZr=yE9N;o5;V36=0G9#w(;*)z zOvB(dD;5t}Lc)g6A@x>5D)Z-%9xLI#HJ40V2{t^LTY}XnNJv$Zu~IZMp%(JvO6aAH z7Q(E;;6gi(bXbK}5IwI%-c9>TH!^>f*i3f^?lU&YS|z$OGv<>=tI!PwEXKiGb+>02 z8l^7E3XQ2wf-{NeTO)c&i8?w9G`Wp1X_dijl_=<~CFCtZ-yD!i>a9lpw982E)krse z8I@q9>yA&eV|(G=GIDV>B-D2~F|9#$hAk&esTs4J^j;&@>riDy$=@s5l$(iTF~VH_ zPF@ra16NuJM_Xeia3#yh&NU)VRU?kdRa^)^GjJm4=#T1h?KsFMaLPhUpvkd2%0lCI z2i^~O7fOg9#rXgi_yH~uxX?;CMUH`sSW!ZR7}Vb6wW8J~8WegZ9(DAc0*a^=Wbs-$ 
z+=Cy6&dSdN|0m$5P5Xa*oezA~)BpHycP%TEyLT(KrmjpT$(3Y$hAWfFT^?IYKjB|NZ8j{GT`Q=CJxgUkJz8U5CgbbT3fD5Zt{!>gGFhzUb451Pvyy16 zlg@f30KqyL{}k~RJt{<{_&vsn}TkE{%t6WOptDVR({VHL6pWV%*O7wCPAvP7S&YoZo6 zH*W0xC;qf-EwV+({<`GaJlk}#sSe>^)gXY!+P}!GJiHs3XN7#f64hIW+h4n{>^TIE z?9c&ALKToTAUmWUI*|p)d@E$qDhf8IUS_YNqVf_PBHmTBVNPlh4w=}5M`t~+=kri< zH~2=*Ws^D<#H|vyqqQ4{KE=9~d=DY-%w}{B ztY&o1StSQ?ALCy2-}%kEomxhAt`3wxU4})-Hm;H>YX~RT8&c6 z5Ah;Dlou<{#CWFOE73K&Re2@rX*9P3KYI@^8T!i&PqU6#x0(+({vv!v3rpAN@i)<% z{qQusS?d}Z(Lirj)*v$)2+`jl^BZVqmS^OZ2HIENGxAjfThK+%@X12*0A=7a*lV!g zsN2e5nS|Zk^o-1ShTeM5GqU^{dQJDUymy_P@;mjR@^{ZPg88qX4q3h z*uO|-IdMGw*t2-5dLgX7PNwkZXERPW_gyC=o+FQG>%wYja;b9Da~a2-QqI>KV<->( zXW^gsZu)R!hsvq|StBwxvR9I=z4^He+mIM`-Ph?qrnDwY6}{Ga8MroMy1DKpnYETu zXnjc*uVs(U>yxc(sT;@3#3g$#LO&w6u}(YT-xnA`ZLBIyJ;?_r|+Ke9?>c4U8T z#Wl$C|3_AjEblKeE8+PP;mP`c(|BCC{k3K9!L8yi{&oCuWaWR6=}cPgrORrRfzLBQ zH8#q)=b1`pzxJnM>Ub*fllz)1#!o*F=E{T_RY=Mt@H{<0`i7*3I37y=FkA1VE7>3; zUZBK68)TB^*w^L87xX0Oby*0e)*?GpOFEGO9nR~rLqE6sb?JN|<3@A# zM!9Mo;T3L_d)J|Fr5pKdMvwM`Wxck?u+L4}zsGJactZ}Z)1&YkGUCOIS*dMr==o{l zsKWbVM!q@xhU|I~Jzua%MyyAhhQ1Y6Z|mL3LE>DWVK=+qmNn}$#*Om7&AgX1>CL{P zhuG|6y8IqJoDETUUSj*9dp$=i`TnqaN$(dAmUCaCy)?WdvtL4EoLgnhOX&Zecf%YL zCtgbU`6b%j&~5aZ$u+}?KIX+~+bM&j{U#sh>+RTw%FZZmk$;^fbx7OD;qPiPHe#J$M2s#Ht*A1 zIWTQ9{kAUqCfoxbvMS@@0VXJ4ViDE>%xXnEyF()kM8wWb|mwrdkF9(Z2OC^Bb$EW2K1k<<~8 ziH%fn$xfNBS-Vr_HS%23PMvUeH+ISv&D}d?S0fqU_^Awdjqv@gGXAxUQ7ILlh2`qk zILnxe;i1>`rUr(Wv|;S$Vfo@U3VAIb_^eEJ&6yk65y|`_ET?Y3M?HqCwIPh5dIMfv zZDHk>Yd<2(8?{$|TUb8DA*BOD&jxlx;uwy3J>#Sl>z844`P(KQuf#BO6ZMoP>u}vsv zDfZvBeYoVj$vIN?4`I3JO@iwCBPH-A3CP5-_1U_n<=$*KWQyX#PXoF$FY|tJN|8(Glr*F_k`s?`iTV?j(#iSjNu)7{wNq0 z^0dpurh)Tcbl*Rdk+<}&6^81!NZ29_&%H&o%@{s>i-dLj%oj$<2Ko0=y6}^>@I$ou zzlPPdvieM8(6ZHRnqIMmt)=oVK3Pi60zF$YY{LTB4|U&q2HCclB^a`zq_%Wy$;de- zi2XR+^q-!yqVi-QeYro*$ zuzJ!gP<6_%5?qMa@kR_Ex967sbJ2KK$+$YR1z?kH3~P7HXwV=L9hH;S@~ zCq|a`i6oC668jJTNX&(GdUoU6#qYbET13M+u$9I>)GS%=(u*!I%T(CUZ;q%vW?8(6 z<;bdc*)Awam9O5VHz@5V{hH_vn)=DOCc5*gBO)B%C7Q3Ri79=@5wZ;V5NBHC@TrBK z>9*CfTaYH7>t}+vHOS5~PD@8_%Q(ZF-(RlUM%bSIGJhLg_{ISdUXe=&ga%2`_6(ajJW%Fs=lR^D<@xPoqTp!RwVk11!O=4CJ$g8k zMaF9uS>&qs@Mq&LdTn21$g|q+H{_G|@b4TX=4RySnKBP1wqAeCC*rsWKaxEYvHhBVHw6BDs^ zNcqhjF3Ub- znYnp*PhM^X2*-%-Une$)+6{XAc=WDsp)DQ;)srs~9pz z$$*ck;PO#&_Q%BO870?YPc4skVn4^O=Ir7QJB)C0Cq93zO63>Zn z-1?h-CCe5dqQD6)k zWVhzp$=utE-7GI>F@gVJClji&`A;Xxm1@n@beUJiaj!L*$&V?NCoUG1F^Nxn2xFwk}d-NgAvUX$o1 z^h~w#X(lDOC%O^rT2_Os1KA<1(Hd5dY!9-tFl%k8Bm~{PHX^s25mC4QAuoK$T(~gh z!M^76iRiuU?_O}ENk0BGqrX0L|LRi;sUFW8W$$-%t0P+g`Lg$xQB}G~@&Ky#U{m{Bo*Ji;ptT)b*NuN=hzO&`_&j_m|UzWpE z(>W34k=w4NPiy}yV{B?6cHRosGfnH~yaqJ)ya@YA33u1$tQT9)la-%SaJhE*;B&&U z+ofCE8|^aU3wC)s>~iB5$TQEE#b1z@!t-@kmlz3JzaV@cc2Clt-9~t^^JPjK>F`d9 zsDH@ican}pZS1_ZVE-{W`uMS%Yc7zl+6Zr5L4;$Y#I8s7m&jWRWa5|XsDumng?Zv1 zd!cOjlJvMPlUld^top{+~^H+42nT4|JE7Gy( z;s_^|iPwwm*+!b=e2ruNr84HXJkSUsV zSI8{Q<}2ipZ%CZIIHI?q^yc2qZ+I!8wpcp9A-uL?$=XFczANRrUBsJql{~VGrkZzk zgdbRBu|GsU-Nmr$!+xt|=pjA*TMDn^Y8m}4g_kp3W_(NGR!xtn&3X*UmVbXsYl&SW zA0Rj9&yeW1IzPOG5u{sboGCLjcROWe5ci?i$_GL6Q+cg)YWl7f<2(HCxmM1GskX9+ z8YUTQQRw@=<8|DjH^_4B?z=&@e#i5V-6);kY2~?5CVo$Pif@wH-}79{O%Y{Dj%khG zbC$SZmUJRF=ieej{=;*=TV>LJu;-2wxz(u(KE4(+F7 zjzl$EZszHa`3@?3kTJn{o^+wYZEe!yR*TXx~k z>~@R!M;fR70U7@z^2!HfhPF?bFN=S~f9ZVLpjr8#?9%LbQ2PBuec2a8R3!P<*=0ZJ zmv|nL+keU!m*skda?_dTFQ0!KUuWHnY!l~>rA$sf)t);RNt8fF+5Z($Q?$=RYqP39 z-oWp_xr=N>`Vp1Gn&Enffz<sQ>%m&$U@)}^vV%QKhBu3s7Pz01Vtpa9B6&W5QziSSl% zV!}MPgGGE^oowphxym}()xk^d1?T~ z)cJTywrjdqa<>@i>$6JF059*N^-t)c^`E**rgmkF&k7(wA>whaj;I$Cz2UNtciS81P?LFp^0#~<;}7Wk zeJayos=HOs_Y&_K)*Z;mnYJ7IK=eod>9*SX0sX`O$o3#>{~uW#S=(P^T1PD(@_efd z>?Qxw2p}UsttJ 
zspY@x(cl3KlknxgXG~4qqyCtdWyui-8GYxPqWUu-eTa17K@O3E*!i+a+aJJgF76{M z4{{`!pBm+GDe=umref49>nBr{-X!c7ax)0fT$ zO~%so#+V@+2SpE`RU|sh085}jOFOkw=6^fccH)#N+;o4T`>e#f z!*|pDzndJD7vLToD)Y^RRW(eW*EF%}N-;)A$ZYWY(g{nJSW=8Mb4iwD;bN}IlCx9r z>HpKQueek2-jO8>wU6xKi7@)Le99u`4|51}Wx_nl>`gFB#`ZPRvRZ~m|7+(VpR_t! zQLN!IwJ*gvVWceXYw$BqqvVsmWL}P!XkTM|YTNNq{r+nQI~W(G8fTg3jF$PS#&~o4 zXxWfzoSVA&6b@m0d$e#YskxhF;W1cV*aY`M9gF*0@In5Y_=+-O|e&zN9daI);` zM6sTdM1{C_{c?o}8X$j5X(um$TC-4%c|O zF3n)MJwX<0d;J92fITI2MpUZHjKN3gTIl;F|4oeQH`nzbgw(2OQO+h39qKDb8sk!(S40!<0~1aB zNTS({{X^Yh@CH#jm04UYQ`5=U+G4pk-I$$PH6zMSbmCR2l%tGeQ!kzwRprUe%>1KB z{-T+3)lsCYz$x>NGREmC>iVNdSHaEFc@*KqZkDWpq_=XGoI4PE`7JVQAoUi$MOJFL ztwOdAy8IPodkQtdsV> zkTG6*jwW6Cw@3M)E737duo#8r-M7m;i*Z$IOJ!7Dlzdrip9Q^Xnk(ZnjM4oz-yc;! z50HVkpdfc;paA(bvOR+YxED*e=7c3O!XP6Ho{$*^iej&oR}3=ftCjB!V{&Tl(kR~y zC3@(BK@`OKOJ&9&<7{(`*~=hfdTRcvsD8U%fA%vdQ%C%?OwBZ|GH1Ua&t@89QZ4JE z$!~`S%P*P6h}5bVqpTIk@Nmf;jDps#ml=bJC;O$SnwESeZQEeuO!JUe_*uzgaU79lwCt; zXndYJ>KLt9Z_45<7B0?XjI`A9H=|4`601`8F-Ea@-COdx_B3IO96W|VJPBq={!n^~ zjx92Is4>yp`i{&SN{LnSI^|G2r@t#d4y7b*O=1k=p}9>maTxM&lgt=~zwkC$In3Z| ztL?H$Gkd#yKFk<1$-Ox-8Ps z^q!XU;#ae*&q6ig%~1}45=~>oa8%OzflM22%pT%si5}jrP3#K(%51(xJ|9jP!H>k8 zjrvvXNHiP$SP$B}Hvf`C^1|Qnl^N zKMIIbsKgR#PI3vQ2Z5X}jvD_D_X6BK|A%`S?ta{h2ulwzi9w|LFM~*zsjn&YWmFy< zsT28CROb9l-{Bu=q@`40`*5UjQcCvMQR&eJYkO3Vw;GqHWPcNtyRAsv7@pRK3Bjm* zY$c4%7<#NU;dn60i*t#&3vafj$%NyK2`M$-Mdg;`@IUtZsPNs@DJd-&-aF1X;TU}- z)uEN5o|E_O=D+_$C1n)-UKq!j-^zl=jl`j1h0ZWQG@^rxtNt-Xf0 zp=ytwlMR()k2j7_@$QMr1;^u~=w}jrym4G=6*n*)pWJ_V<9K6aO8KumibF~R26a54 zF6xNNi8;nesljkm?MUutaA=(-t{fvTwImj0ckQ^V=nLM2mLAo|~I4&%zYjQ9W|%le|?QjxxF)CEcUxQR|QD;X6LQCmbYWPo(FsKe~r6MG|L_ zH=Jl(l4{H7QEAD=?Z+pQDAN$>K9NA#5FU|hj82(*Oi$uHIw{Si(bpd%H|CPWb;rm; zn3_JcN8PgYHgfcCE)VCAkRNk-m=|b`ljyCoNA_^=lQ?h&BtIZc8KWyRw}*omKDrw&g<}kMD^KZBH~i5(jWO(I>#4GJ z46U-{^d8QL&Y#78O8=9MGgF;s^e_Wt(!i&R+P?NoZBKrQbFz`sFFmhEts5YV3rThu zfw^juWQ`>t(?vb%M0xqB#K}*pTr<`?ycg%j6SnuPBo!_S8_BuKdK(?Cf#Ih#@sM^c;S3OLsS} zgLK$8$n0@ETX&-@hp81e^{A_ppZz+YL#uX=GxCO2&ieB~J&n>oYV1Z<_Lt4#%TA>@ zeYfK}lOg}pNH%~SCo8!00_EYJT`ray?r=lpqTYH%I9%_gEBi+c&Yv;(A(~Q~XjV@V!8uPNkdwV!6 zWJ7qk#7?8m%kPt{@eDh=AC?*8iM0L^S*W@D5m`T;QnEZMpN!|Zz#^UIgvy_Gx^cQW zo9{bMH^%g{zt*F?sYzXV>2!3Xep3(Qraq*Tj?)=8{aa+z1Y=U_=0J}cm|VHen}DL$ zej+O;5JB)0**d|Pn9}xHkMvHU4Yz*LBO}i+&Pc8QvWKnM#1!(*Gng`d{uSDh3}O8l zMpo+7Z+h4UPHe;n&oHu60=s%donc&_8u+n?MOk9=y<(z~pHlNvkGwFEv{`@cQ8SZo zwCtH^SW^l+dgRD6b(hoC!}~8N<{&9N6NTQow}-Ez63a*TnFQU`-6N~cG%ijp>FH6? 
zMAa?JGjdah#(H?Kf!)|+q#(~2no<_);i?DY%6={BF=b~e`XF6QYo3v(_lF0bMSav4 z$mFxopa6G$osvvS%~?jax#e1U7P;AUy?mmXStbYZpIXEnUj6@a=6N<#m%g)P@!49# zZ;>C*rmiw8WZ>Tz%yw7ERLz{(viNTdsHI$Hc29Cjl9F#+JEoLN$o{(coSWXqRDz84 zO5)GP{~lzQ-6o6k=_EY3El=K7@NGVN8@fHFR_jwtrq1UW*{S7~+(VZjlQ~{ruaxrmdfh7;g-dmo6Zrl8@w<3y9Jgi1AiRVlIDjfpJP|^G@!6 zOP*`E_#EAw`H3tpARo1@a?7f!Y!E9n!}Xz&$lN6(2d75bA%9$wiYH+AY7t>|5m&zlBbTfGk+`W`kou3LRD&N0l z)WycM)XiLh#FsCJze#!GV)St=7c%{nTr2IknEDUAB<3m9_wJWuoMv;QOrJuzxwz75 zVRDUBKZSB_eoZz_q5YWH47!94_5$uu`!@NO`xTdvx{eR_tLsK3E9{9&=pF|AwqMOp zt{uL>Lu%1pE^kU!l%o1x@?)k=&{nh0IE+y#W!(p4BV zR7AMeNp}@l^6KTrSt*&(1JZRlx>47AK!#6aq83+#{ikwYKaJY2G|S3qxq?kdk;^o^_(P|ADS9^x@29=^p_?u|Q=Zq1ohe_H z7?X#2^Lo{-I&%K>?n~ZDeN1s=UCA}x)W74m0iFIknr1(zml<_p@9_J->zVg?k~N)@ z+-&beOOxTB`w!*_ZTzs8)-Pllz#;FOpyWfm(R^6>C0OK8IQAX{F3Ng8X_l z!<=4QmX|U)$i1{z4}-}Eyvuvp0!yrCr(Hv% z_D}0o4-81Q^2e`XY>$=p@~I$at-QE(4b8BxQwGjJ-D|ItsWXg8DXtrP<>M%A%Qu6W zn)@dCWCq3EbdwyMp^L7Zs}qdL=AxVBu9 zuwRFoHMwNQb>w60?egq(pga6M^gyIaOwPZ|oUWa{<0bGla+U(dY0 z`Cj?pdWwA`zd3!De%p7r3@u}(#g{14%5*tZ%e*q4Z>^Sv_&!nrsgf` z<;W~?eUSSm=96tpdexHTYSn)es%CqFU;0X_cl;*Qmm|9&<dwKsXu`7A?&1hcJb29H{ z(lueNtkkSrD_d@+Om;sn2Q`adlyhep(}p=-`m@>S+2Y!>`m-`C3lFY+a+|7MolU+Qg zkEo0anq_Q5FW00bw*UWL!Jro0$PEaL-bHWFkI4f|$@`89-6d@9Wigt#xn|_82(3tsFgzIT%S=MlTlVc5_X{r!92bD1E$Bj(v?Qf-rr(;T`@ zrq8By^0vvnnkCz0`D`?E_cqxw8;xDJU81wu`*pu3ljh)`jjh=*wXV6BTfdSw*3Chq zsy>jf<}kiEKNPbI&5wO3Q(S~o+#<7Gv}`^VUg;wIaEolwa>qyVy^HW#KazpB8KcJ7 zcJv-zp<1Yg!t-fW$o{(dQi?2ZhfKeXk_+HANo&y%S#g^&%-pd#m=dF)HZcB%S1QKZek^s*D2 zum`lg?blvC?bnUNe3y}x67D31yNq*Eio1Jd=3Ru~@9tG!CVT4V@sU~)?p1Fjqx#`4 zqV9HV`Lb)Vua>|@&9ORB1vKWD~!rMikmXB$8Ey+tlBpd@~-A~_{} z;%a}gEKj<}7;ers%jA2|Gq*X;H+G5fWXU~DThsf+)nmzeYrO}}uQ(#Eu1fa2mU|6b ziaRYXU6&H-%zKSX^_?#3?`5OBIX$laE#D2{7;nsd^t0K9+;|__=QZT<`_QJLgW}Al z6FUK4-bd`*g%rholpFcEzuRVvTuKdt3)o^3;RsI}<0EmN5l4wr%Rh&wl1&Ynj) z8?$A`JY!;3hc&J))(^A$YAzVqS8Kcv|2fCW`gz8z)T*2~-_R!X^sH)XCwFw56Q7+A zqHecW(|=Wrk!Pz3uWhV+P_3sIr^J~GCtlx9a~swn<@`2zp$?xvyZ-0sV`>;D=eqT@ zh@T=KuQw`r)w>_F`PFXYvQ&#b&V55vK$aZ+02y6imvIl!5jyx?a0X|65!h`HkPyoS zaavYF@1K8wgcMJfUD{u8a-2P>aJ;O8oGL z2n)%%^@c=+Zr24Cp4P`?nIhf#xymUr;z7EtFjv_;$g*ngRC)G6!U<269S<7ko>h58 z{P0J8y7F?0`*%TxPO*xDAO0{7I|e{_%Zki9W&J~xQ`MdF)kAt& zai^GTP>AfiWVB|@T{5Kx_px`&U7G&8WsR1%RLOSDl6&Nr8lJP=E5^ex?LImCVPnj& z_WR=h^~wL-zxOfiK{g_}rSQbV#<8aa=Ee_Cz_ju=}(56P}ajq$?*55*6ETv<$7a!dP|+L8UP!_1Nk z7Eyi+YGmFb@*S^{XBV+SoBeQH-J=uFpVq~8o~e0QzFI{6WiN{Jo@_!7M*IssiTx|i zzCpr14ZFG0!$cT$E`|tSS%OUmBX6+1$71`OccQM0|e=)mSP0vf`Vh#*jUeKzV=pU|JLXnJp zQI;=Z9qd`cwtZc~!j&iMWy=!ECFdnMxI}kqFUbfG_Oh2{hGy}rveHBTavS4p8J(}U zyuSBPAKuq=Uy!h8J*MmDb)B!oMyBI2BiCH`hTQuYyQ`i}@x&tWh}LWAi4R%ASbT>p znb*AI~3>X+5j?<9YN$ ze~7j6aq6dVGwc3jU)1_I6I08UxOy`=DNKKYvAZ7o;H3VY`-G7-q~`5ETVE9}6PF#&k7W~gopsRITwXP=Cbtl!*kG3_%xu_v=xyX~X zY-ZS+6Ab6o>CL_G<8n(K zyO*)=b+4FM!f&cGrlgGhA+EnnA1v8V5<)(<6Q9%pW4Hyw5fwN0G1Vn4Z2DtZ@}zOx z@s=OshhC=n%CfwVlVNG-BYE z@ucBAZOH76%gmKLS&!i%ZODwr<)xLz{m1nE{m*_@?{^f;Vf_9*E*GrAR}F^5o{@VM zIj_S;Z0smwAcDb1>IYhRKnp%iJa8~By(B1QH-?d3(q7-^ci!1z%uAfVbDrp>*KF_m zJKs6eUjyh`#rgjm?g)rW|&^rUnZ|+xCr-` zd8_FT`wozmnmGex3mjHC;P=BzSx53_&$KmkeYSKNzlNd5ca&VWhAz~7 zw5(ZUj6QAS(Z8#cct%e>_|iWyC)|BI-5v734vR5lEyybM+xpBkpQgJSYWZD%kEOrs zDtOw+zR-<*gbpjw+}LNoIypts)0_o&kS5$u{;PXg(!Cs61MVK&Z%a-${7)N2=7vEs zu)!E}c}eE)hcByMXg+^n`hzKa2$~%3N-J4y;l6wTfm}4uW1jk8YS}-{52Y;ZYrcxt z&$jVN#rYiR*m17GZ^ww=Rf?SNW-~I-U|5f6LAG(!@49sl>(_y$Wt0qghF5Tg-r0xA z86SvaF2L+OL1sT=jOtf`**;p9J!6ciE_>vlT5$v$Cn=rEi`mk*Q~99{x?wR4LMIGE zPgf4!5ZXWyL+k6*4fI2+kn)Y4yo!b$dZ29+5ox|j0ChyT1s&0Rw^KDj z7i@$XnifHqhL?XU{Ew{=#lIt)Gp8({#pz#t4l|NA6lIRSjosS2PQ 
zmO;;lBnUcMcn-QgBBB*U_%S(#ft{VI725Vv1@-vrrV3!Fr&HBHdyEWgnyR~0I|f^y zE;aNiA~JWWV(5XDFktCY4O%{^OSMA#kS;ZLB?-&wQZDE?wo5f?TDw#jI$_!>o`bnC zXyd03UQ z@dXzK_lsT13tg{usQ|RU&W|!c{~JWuKtylh58B@*2Qc_9Ie^}#E)|EaZC%Rt3_1Ov zOO-?4M_tOV+18~((EW9n%6S%hdzUJL!Ed@$)w8sJ$1aK(fhkDJw8Ne*RrnkM@@o!@ zG`qXhW@rx+5DdYbwd5p1&Y(9+&R_@zp{s`~_&4FjNDvI|rz)YJpL_5_CqK84_B@KO@!gO`&)Xqy&N z3!r}{zv|G8!E^&2q4icgzDy3e=f4Eno**aCDBn-YtSSFyhsQbo|Zo}5C@J0uJ`w}zCdk>}qHRr71ET6sdM4BDDQsuo(mrHG*$ z#-KkKQe$5uq2G}(=zt5Lli$2=hnAm1Dsuz=e+j7q82UA&Dxu}Kkn(DFhEx!`qWoIJ z>$t;GXz8gYWf**l-b$+G0E7QrZur-N3-+o8ZNF%* z+5_EOj6ZZU5yCv^aqLwtXq|>X=z5gf&!KY(0l$U2XRjIyeUEXcxwb!nKWKXvf7%Yi z(Ej#bm9vHR?|X+FWAJU=tE!;)-My+&a~mFE2o8Olh_>%lMKB0m(DoiBq~&lO^uqwO zHuId8!yG@)!x9*LpI;P!{tx!57HImA4EkyRP7HDF0b1W7CoPl+^ucmy`-q63A2vbv z4t^>CIzHa3a<}3S7D4CEy=o(LbANmY`nmo+_g&n32uG9a-`BzrY=O@G_Viu%kKoJ?J3=Ak7io88oG^~!F1 z9@njkG;_E)9lB5KR?RTL?eH<^I<;HnR=-CAPVZJF&~s+Ds)v?yyH$&pb7gy4GYPYI zt3v3R)U9fu^@48Iq*>6dOz#uWh26>y{a2767~rb*2F$LB*kbU+`h zgh5!V-R~d)+8s7ScV)K79_AFP6byQ;g@A`C$UUTC?STkoMA2B8Cn zp&Mof2&jrWg#qY#4b|23y;wHEgoSgNP?hgKds?g++h*4_EN{t3pYZO zqJx2MXbs|S;)ZwVhgHzoXP;_-Ci6a39l+qi5Q9EA^gBGlu`mGb(3-MO6~iE$tI3ZO zG(%hJKGhDLFb)II@;wQKIncw86cj_t5j+QNX+*4f92ai?hm0P#uUh3|2um;-r#Q`6*PNW31{7eG;iGcYL z0ULCk#ZM)`;5qx08#>M-BhYH!r&^&K#-XkH{C%om4@ER-pIQW+7wl6Tp%3nct^)2; z{~3=L?&CL*Nzi0+sO_*y+b<%A&<>km@M5a)7b2QMxX^S7zaju#a2*W7HW;`Rf4|cH zT~kpG4E8Gs2zsw1V$JE)VFxLlxlfhBP}x4^ho+nNslLAv0UQdQFb{fR3G~AX7=UgV zgtagP*Fw{*eQGnb!S>(u_>Uouzyq^82>|9p$1PL^bix|wg7wf1eb57&pcl45AMAjB zXzC&XFcSu0E)2nZXsw_GwH?-URTIFi1c<;tn-pmW7>6Eya3L>5)wuSla_EB%(0&`r z2u-(B0(*%FI$($&QdkRJ=#;6O@Z6Lv48bxOm`{XS{@^~<2;B>auNs5(pVaL>B7l`J z1Z$yfAs%56wrM%+fTo9tAWXtwCUikNbi-okh2_u>-7pAip{a%hLMz-19k99`gA+pt z2B9TF#1B*V&k6Q2;DTDUMQ9|g1F&KaoVkC^;ns7o(?LOs&R@kWJuo*gFJ9NVs z^uqN0lmxWG0Gt4Wun=08QdboiY)dIBXot1X3D-i?GTN(lhaoM8=?5qQm<#=|00v+Q zGzlFKv_cQG!v^St8=)TtVGxF)sk)Aiqn8}PY-or1&{%GzScJ=!RanPIDCzK#!NP z;2;TFy-zKGwx?-!&*zZBDCK#Ph@c<(pz|e)Udv%$vr=9-6gqwA0(8Sl=z0ZZgaH`Ve3gh&2)B`{ zfx*`a51KZTAZUecDb-5ZFmxcWLsMVH`i~-lK9~pnum}dA69!==bi9GCXgk~tO`Avn zbiuHe!=b78dy^`Ke&~d@&14XI;o52prnl%&pc8gLKTPjOgwP6ua00Y%p?zyR+zf4R zQ>V}gCdW zf^N7Q24LT#3HLiXIB5H>ng_JuKco~ocheDR`wtW`()YFKTH@PHOTFSJK# z9-1+f2D19YQ#;1Uh~vr`itJLia(G99k4rcnsmf z!eeOvob{+m1YWof24EAk^a-ooS`O2PQk3SfvOxzdg+VU3tJn54ehomo_vgmvVLS)( zp%Yd?4_pL8un{^2gjE|1!t^ZN{*NT!EOMM4RwXbv5RcG%G#;VL5>}Ss6k$eK6++wK zF#A3P1UJIa5U$*Z)?r-ppN%~$tQ^okJghv>mW@B?f^E=rY*_U@mV}MqTF`0?ehh`s zJCZvvq1{RXpbzfR^5eN{a0Dfj6IK<_c|us#Li^}2&k`YwL(hqnz(~S_`OuLYR^`xi zQdn(q!I%OQ9dSprr_9gihEDO_zpM41PbADi}i* zUWR|@hI8Q}2f79QFbJ!imxtAalL=^ASk=G)^h5s@By249Vy@bRma9-s=!I*c`)W!^ z+e@g5Q{c?7Dub?T(R%2-j*`)G*am~J1BRe!93^x;;g3_*%7?*@z*2^G!vJiAp&P@h z9j>{F2Tmoxaw3AZn<*jafNP;=77=Lo3jEn{zm=kge&~XB7fK3)uY>BO0*dIpbxt5BtjU3>8F$6yT|~vR*?bdg7sPsTcP6~ zbZ7$M!$Ro0msSK#_mPomZJ0~ZKnF}egB0FRYk(${vjT?Z(=$TH0s@3S7=xaL)bT|8 zJ%m4Kd6)p92e!i?G@Xh35jq;^fDY(|E*OB-9t_q;$r*IP0QAEUbS|Qf@<;$IhR%PX z1JJz~WrLq3tO;DCmJ+X!4Q(XkASX^T{d9g}<%AKlD6J6+l}9B?bd94uj9( zehwL2OKX767bxPj7%c0^xpsgt?eHQwI+vn`6QE-~DTdCM(G?hg8=>VDbOBmnhjxeQ z=i&cVIwt6Dq$FSv`k?nUN&-4JQL=Ve?We9WgkU)gzC+Ik?OQ1kXnPlBgl?F9J_&5X zKXh`Rbp;HVfF8?J?(&v~vuRWrAv zgSn6tLKifBMF#@CaI?05jW$CUG)>0d&M*s|umD=WK{udf7rFu6uvxo*OBG&3iNOMB z`ksVC>wkzCT6Uvjh1H}4LvbMvKaf-CfVI#EH$uyg6fN{a)5X|-qB+4JEP)U2A&2l+ z*a-bU(`CaDG)*DgUq}daz&z;w75~uEK}l3&cnd>FJN!n@E+M6zbQI79D`5bxg}vPr zIrQ$Mj*FC9eE@$j5GUc#|2qkQ7DW!>Z!r5(!ZAgZ16un;)Ld=v%e8LMlp10EHDH+@T%1kBq2t=!b6TNROzs(0f!w1)zx=dQA?( zhlS92G*@LphlN`iq1O_rR=c$zBcjqSr^w(0Xde_&mCy$lX*pa61DPZU1_zTMEguq5 z1=H|1lmtV|u!yRMR<5`Wz#z=I0(WkcoeRCkkw9q6iKuoM%BjXfF%I0~S_*yK+FA!a 
zx!mpsT_*Mn#!f|(0R5i~u7mOBZk1`ja!Fc-c;-=h%~f`LUS-L;TwChMU8F`k3&$H~Zb z$e)O)63to?4(&@xxTcT*=!e3wveoI2e}5kvPUR1LI#$_?Jo-b!+~~OVjG0FDI(nUz70Dp z&^$S+YG80|RQaI&6fRHG^6FE$tn7AD!mVg!(8_ISHQEl>LH~rP+O6$pL{-imM9Af4 zMbLLK1UR2dXKWu`gv!kkSB?*OA=!E$&1k0iQZ-fs$a4oD3VQ9wS&yT7ww4M`H znRime+>TZOgXcxnM(CczZE1HAG0caK0s@4V3n^L{EaZ+KXySUb^1F!`E`om80GuB*Qgk0FEX5;q!@1CT4Rr)0e{eOBMFAKn@9k3m!pK$^GMmv zc!ZW&6bZE6LeW9*9h3-k-A#^2sPmr$1Z@uy5De5%QZV!|5xPmhBP0-dV3FpdQRRi9 zMflV9f8p-|!dr}TLm#Xz!C+lNj-lf*luq*rn$LV9tc$7=XnT???=+X=4qdPXdSM5& zt{~tCiC{GmL-!h#69(Wq=zJP~&(O3Z&6eWao>V2K-2qC<%YhGaEHOqsC(`H4bMMB!gf(bFz^EjfW9AbhjuQaO0OY7 z9h3kJbW~I47~H?19MI7jRetD#t-*vsY2+3yP^F6{lcRtJ1l|r>Ryyb3*tnyhyeTbC_4-t(WBhZH>5{3Y56fdssoz1 z1uFAjBoyXpX7#8l7=kU(egSt*!C+yJ%3qB8r9G+!dS>Ddz1Q}rzDp?C>v~i!thRGG zk`sfI3y>CQ-rS=$!>_pKC=M5LSCP%5R6f@cl|j$lJ!&KLR`syn!~a|o0{!>*C>QiR z(4*?$@&|i(8Yv~u4ObhL3P5cGw6 zcJ|kq{U<(4*?1N%g2ObeUqRXekNk6H}GYYmTW#Xzd$QVd(3J z`!eE5i}8{x;lbcCn$IN|!U+5@SIE(T7;n8&MBMn}g`T6h{snpm#?;U{0?3T18t5BL z1kf=g##^ttYGSt zl+_C#h^azopHIm_$0ISdPRn5%bUjLfU_fH3Vl}LbsXAzVGFGiNWAH2|LKuXZYbfFs zF*N}OpcC5bDIpkw>!3GpzY0OuMf*9B!u`_yss?(e?pN*5am9X>-GID!zbc1rSPMOW z-_Ls?RPprvY7Y!euino|6&`Ncud3irupS0n1O&%Ff%~(Hn{)RoC%mJMfZ(N1?pMul z-0J-*?K!1hgZclju5$rzqRJQeX&Xu?X_;iI6f6+h;sO>4u8*QThPEm`H;bYbs{{qb zrB|bZcGq2!Z@2EUS}`g*l`Y)BMXhetqIFvlG0Ve6tZWynq85#aYn7rAQA#HD+}}x) zOYisHJKy)?_djROoH^&rBcYu+#p0mkE`ROP-^m-%w8Zfw*kK#zS%0gSEAB zY5bLX*2QHVZVAWbZtUN{_R1UM^6F;lZHmhv4zx7Kc}kU*z8IIb=PA${mz6mDN?g`r zcUxR`sC?XuBiQi*JHQjMyPfhX{c2p+VB?Lr+=;z^kIR_K-yW9*FVgTGl*8KFaUN17 zAGf|J12WPPm-ZG4?2XIIaj=UWZXIL9O^04NzuAoMs&Uk(qH`p`PO$Dje#I^Ux~wo#2?`a6HblMKr*Q%VPn8mVSEl zHyVJw*e#gL0f#d!*{;$yi%0Ysks+4!v{5eGl0F=@TQZEpIm|hMJ*Tj~oe?aszH8iYG>7YC21juOyH{H>ihWqTMfr#6 zh;oQ?340!)UL3_;IQ%H(_Q(LwX3TE^I&j9Z>yh_T0{AdX_&+w7o$3UL_E#oj0B zIaV{HG~gI+#$qiE!V%nqwZG884$9$5Y&3GRVjr&Q2(Y8|Y^YY?2o7)HTzH29Pf;=U zK21Y#_yzL!Qhu`~>#)C-V~b<2a8mx0hPQLF;_$2N7)Q6T<4($NwWJpZ-(WlJ-oe?A zBe*X>5Z*<_|6+%`X%W`m;uzr|?!saZm*qYh^0p;sV9z_8WH`K+k-{x_Cl>$YT*BHu z8uTtb?&4}vevb}f-}~Gru;(Kh)TWF=D){C9ArZr`_hsfINU=^ zRsL5@d-6UT;%cmY&HVv;aTAW<4wXL4$Y39~eLy|fgMGaWJr3dJSUX|`8Xf1t-WcEXYgY{Y5!hm62amJH&Ubpr0jV8TR8UEG9CA1a6r`c^oRGVV|(yz?1~fdeY)) z37++2M`!VRKKA1#>^(anyK(T`gw#HzVbc@Ri^CUCA&$(VfjCx91F@$v!BY>^b5TP2 za2VHPHT6RW_FTsL&lpK>f=4;I{@qt6UDwY<-KkP5MZjVgMGTX4NfGao|>))vqp z+==alo|r-2zSaOg?i631dKhcKX4G$!Ou>|0My4&(piY{l9J z4grpAq+!_e6eEILo~B{F>=>6}jkye}uz_o_A2;A2ZpLA}6Sv^~ID&hz|Jek`{|EsS z8dPCVGed^G&(lNf!(BLtdvWwd8ukr4+??Ql$W-()4Z{958uBgq?TjS$yw16T-P;&? 
z!2*_H3;I4%1#GXBzRNv8ny&M}XI_U{^<7RAN-Y4O9%Ftrp zJ}N#+5Ah5f#no87h+l^zn8(2R1N$2`o$FQXEY zCuF#3Fx1DW5SL-&V_J^=xDLC!>8VO%ext?@Fqt|AYX`Voe_;J5Gz^DuHP${&aPy)e zQTB&@0fJ6~2aeId?1^!u*hgr=K~N*pK_#dpLJ6Z>jS{B_n;52KBN1_jCjY41ziWAKrwccqa}W#%`eiAfKRoBtFE6)f8Lq*; zA^ox$NAV%7@g}(~K?S%3M{p&M;ws!}@0Sf&y)C{QhaCNqEE&Td?9T1yNk&;i1U+PgvHd4DaPf*b7J9!7V*^KUbPTVfC!Rh_{Pj^nOR{Tulpv&IIKQ%+l$k#60LU*<%47e><6L&%wA|UcK;R7*36LxTe z0dv6zYH^Dkk$>jmj3DWVMxK*$5K(q%VyR1eETdtJX&iRhi-TkY^H~W|#VJ&HDmw&U zfhKM)P=!xfFr!!#l`tP}ffevaSP#t*y_9XP&_uOY6ZuzZV)uMa)K+PtTSxOJd&zFybFY~NCPTw|{8b)7#ne7A}<(|N0=n+JP&(LJ~#xi;^J z>-=%Sdy;F}!T(L zB7PRXregRGGSAmU=~M=3f+j{0p8rxpJuCq~{1*NM zkHR|G2+zQaum#?Pci=rZ00*H5zJ>20;{r|OK|YLw5-5eUU^>i#i{bJMw1AjTumBdq z61W5Ig_W=h9)}I^EW7|O!{1;l?0^pV01iM890d!8%+Q1m8 zHxk?icR?-u9v*}>@E2GQ8=)CqhHbD5-hobd7v6^tp&LGhBXA7jV4JCl5uigMOn_22 z7iI?dXD<8#u7w-n7N~*c@N1}pKSK!C!85QKwm<~lgbvsTA3+p)p$}}cG~tBPU;<2o znQ$@8gTM{^vjmpG3RnsE!-EilCtw{s4V$4Iw!>TSFZc*PgB~~veIQ}TY)#~X4yVBc zI1_#jWnjQ1v$ci7OK=U`1h>JRuma9Ji^gEJ{2u=aR>LE(225BB>)Vd8r39;`DNK z8WpRWS;mo{nT#|sl~~wp5uJsWPgDxy0$&;;K{m>UMM|akko!B}mMjD7l0N z$^=A^BU15w#9Aso&f##T;(d&YD;0lAoSq5tkLfyt zLhK?xJ?+qw zsr2Ki^g67TO(`c6{8Qr5#B=1erSf-{oADp`%V4{6bjM>k4TYxhBQsBaTt0Bk1XbdT zWMWbl*N6JlFvFK?%faTNq%6YPcu$!~-{;<6EZ7D7AeRk#DU<0#lPr|>`zdDXZcTA>>pPN$dwOQ8XF zIs;CTeK8+&&_jmXMPr}_LQqcrwa`wy40q!~ot;BHw8MSmN3n@tf;Zs+IP#t1Y^Z@I z*bfQ#nsp_ouph_@%;l4tU=5r`ycPGs*{3?ic+wuY0+zul*Z^0vt^vBhG14ivkhd2M z;-%0GJy2NS6b5|3IzMiLE>KIbD=E>IIN{38%9w5D^t+g6Oixawscvq`D`}?rQorkt zz+|rZQGA?@)Wm+`mH083#P`mdvB1^%9*FYQ@D|cB$~{859=4Nb;ytJFHLQ@EKk*;P zGtQu12oi_kVOYkx#pGQ*hC!!%6asECqkQk2Od|}4LU1$}BP$QFqYm(~)5Ty=i2yf5 z*)B*$p>djMg2-6DUt*t|{PFA*8`us}6+@Ku>)Gx%a3@4YYL|&df_hf`7ZrrTxR**U zA)YZrcxDZJHiY6sMGy+w$p3{dHbHP#wuoIgLPRS@2=|;3!grndE9oi@lkhh*}hkZeQ*$7 zap?0iGUw>r5yrFyGjvZzc4HaG_B=B$OP`m$fw(&r|CKnJihn&&W?Pni<+usc`JjjM zKrJ86R?ZaBq;2Y6Gp2`F-R=%!b-ViptItdI1tCa@Vl!yd3o^ecH`m(qBIRw$$I8uv zDQ6EM9WOU0r+g*WmQESiB{i{(w^LJo=F%zV6RHs2qFgh@{3PYf?1A*@%FO6^sd9FO zdAo9M(x;jo*}5Y`H^=1ZBh&HtT;dq{nWHPrpD1j)bM*qVY$!RCk~ycE)hcmjg}HZ_ zJ~DSQ3iAP#`O6CP1*}z7oE(Bok>N1Cc0E69QH91Ir(=#;$1t|3sW4X#XV)H; zxVyrfHB2wcSee|BS*3P*|3KWR)~`toLqsK>O7@38M4M`YosXByJ*h+~iLB4ld-AH( zSE zY<5fYP%$kPdx_6U#zmPIRGJ%yv(fBidbAlGuG`azoKT5fgg+Z9F03^39D30m7gi1) zvRr%P!GW~5Sl#UgmZO}Q=_OMPv2Xf6${ZO~ChPw%!$~>bj5s*G=T(|NIp|W%L6@qN ziK5I!sm!umGH*#Gs#W3+M#@g{pi0zMnyYi^;r*%1-73>ejZv>kY)U1xJQD56Q7+2b zTRHFrc-k3Uyw44MMNLdT*|#C3C*yM|lU~}nz|+g?gW}PH%Ip}F|JI;*@1VGAAXZ-h zg*Y*2IDwu;+uDXA_o8q85HecUi_r%Naf znR8saJDbm-$Y|ZX)uk6@^ErBKl5Re%;`6x=Pu9&g70)D|qMM(lVjJ;Lo%~rkKbX=*^xkKjgkuc3%sq4iA+>~au<{>@FcjGYBlI1J&pAj}Kl;Xff<>u3_(FnkNb z?P0%)VaP%#cC-$6@JMurXa%EtLv*iKvI&RqrWPi_3-?DuxP!qYh!|b~;rdsqwGzT_ z!125tcr|uII2^*f{do8kgg=A)nqm05qtIR$4+&mo_!JU+-pVAnU;-1IaCB>{gQEwX z*xcIL(Jyf@9Byv~RrnG%LY}z@a20H=@-5%-N_ZdN59wd#eh$LX5N?OL--hsL2t%Q^ z>gk;Xp$pW8UoKuKoe)d*#6nZ)ddA~b4lXkA||1|ZFO$1@VD^>m-!kb!I9h|1P zR5ddRaD{Doy=#Q%X4X%gW+&c&BAqeX4SJOT|H{I!KZG~E5*`ZSke2`KpA6xuSHkfS z_Il-tEQN6CD_3MAgs;95{=p<93T8Oq7kMa1Y zQ{jxiAZ+o>`@EAMaq)Rz6ihIYxWG`I*A1$2Kks2SIqtE8!3bKkx8Q z`r#0chHwbhYz&0UAgp3|3WRH33CBWsTKy{#^C6<*m2fJAO&$L!zzSjEmGCzZu6iZ? 
zcMuNoZYp@;2^B(kQwMA3V9&OHH<*i14zapda=8j&6`CY?>Ojcr<;j2dIvo4T$sT}k z7KBwinUfG+`AYaQgqOFsI`rG{W|hD5-#sX91PrWK!k?gzH`juYhp7xC|{vE6_@`3gz?R7!F(Lz9iDsf%=sI*0x&_TfbI@EgA1y?S z&|>tk%|vK=j3`CR&~mf_eT*8O+nd&ex}sjF8TCPZ(Ev0E4dz1mLI_4IM>Eh8v=mig zb;7Ty0S!aL(P%UVO+lBV8R$wh6HMe8W?{rSbQ8J-%|>(3Tr?joM2pa3^e|e2UPWur zx(1g-+Zokk#C`NJ`WzKr*w0*ts;B`qp{}SK>W+G$X4D4_eUWI-I1D4g(J^QQ8i_`s z(P#`Bi^idI(Rg$Lnt&#uDT%nT9NmIuqd90UnvWKuLL;^eYCuh>E9!>2qh4sD88>`T zUo-#>M1#;^Gz1Mr!_aVa3>tw(qETqH-9#Ytw(qETox8iU57ap+t$UTA6` z{|hi80Zl?v(B)_bx)RMqv(Rp}}pp|G9Dv0)$ zFraRz84W;#(J(XujYi|p1!#(h<9{VatV6TWe6$!XMJv#&Xg&HIRdx0XyP{sGFB*h~ zCgR2zGzyJHH&`>lC4M)eI5ojcugr=a&(F`sh|0^*f6U{=i(Ht}v%|{E-BD5GijFzCKXc=0L zR)DtgUx^V_sGwpKp(<)XO{gpChPtC(s2TM^ebE3kP{r{dgb~4L2pWopq2cHlGy;u8 zqtIwH28~7I(79;5isOF)MkJs~XbQR<%|KV8nP?We4&8)qL9@{uG#AZJ#En9<2rWhr zqa|o5T85UR6=)?|gW+G$X4D7uMFZ?6 z3V|3Aga)G_Xeb(nhNEN92s9FnLZi_bG!~6R=h}qv7mpDO&;&FIO+lBV8R$wh6U{=` zp_|YxXf~RI=5qP?&&P;DvqZMc+T7_OkYtTCMK4=^Nf&*S_R7DM_ z33Wx?P(Tq@V^na& zONpwe0X3nnXrdc#xT9XE8TCPZ(Ev0M4MKy_5Hu7GL&MQAXoTHFArd2^&}cLUjYZ?o zxoAAP08K(u(B)_bx)RMqvuwinTZa*w&@E^-nuF$|`Dh_pgchTR(Gs*2Ekn!E3N9c2 zl^9WlUPWurIM1#;^Gz1Mr z!_aVa3>tw(8gTqaVMH_Msv_yn=tI6H`E>VLd~cT>Wc<|w(%c`5kY7$8j6OY;piAN0*yqY&}cLUjYZ?oxoEr* z$NvJ1NI;X&6f^@}iDsf%=sI*0x&_TfbI@EgA1yTE_%FhUV)QUtf|jCXXgOMeR-#qt zRkQ}JL+jCKXZxkiO?0tutrm0Ujlt1#j!T7%Z1_2_-{G5Q=e zxY{q733Wx?Q8OBXhM{9X+xU;fhzv9f%|(mQarp&95FG#4#N#Eo*a4tc8*F;$B$$Iz(1QrJJB_hm)f=MkA61aD$u$)^@g-q^UDs1B3 zqe2e%FDevr?^EG0_W>1ZxDTmdfC9)PDww&CsSwP4LWMByQwkr*fzPOr7{@oBQ(*zO z0Se?5nFHVAnVJd;d^o7%LQv(xCtLDqo|O@olJ#w+$nIWxYMYxg*%@L+1!ue(s371A(xx@ z2^I4B#;0&CxmGF^aW_(-n0o}SE%zuDO1QsJp_F@r3T0gABjsG^BNbfeBb8j}BUN1J zBUic5M{2mxN9wrHN9w^u1v<%n9s!-?F&8??b1rle0amFBbP^pG`iRPfK4Rbw6$KM_ zm?*e%XNiIvceW_FbLWbJ7xx2EFmn?`!H2t86nwc$MIpes=`cY#DGGr+qD&NmxR*pB zn0r|iLbz8%A(VSn6vDXIL?N78Eed0}HKGvVoVV>nvPAk?JC_fDf4^7m3IG1QJP`hU zuzaw?nCj<^e;G~I@M^2!?0MHndr=}zyw=-+ScnrpAkEbbS@E=I97%5)SIwFj$Oa;^-3Bs;P_2y~&uf-pv+GBg6hz*5{^?NRp%ZzXJ_Rc4z}; zkni+llXiCw@pdGcti?hy3rGcM&JA{dAt@nN?LY$YCOmv~5h*6Cw5^Lt3qn@11Bv7z zS;6v?NMG^>t4|_5$QR7K1VV4J=q1FLe4$yEkRpjJXTz6~?!cmD|uU~8$iqfiew&Z^);CaeEKz6LgKaOUlUhCK4Q-6 z$XfD|wtF4%B;-TwxAkNrA#=4^8_9>14}#0zKxdl8dVfnYc#r$#TWDD7ds7>_h0GxI za5dYrm2`3_vG@1=IoH|ct)!_r$$rVN!C={X7b>}LxT3!gcs zTsHcY{$-3xFRy0bW|NtT_7e;+`u$`(@f&dBdKfyk=I9K0-G&p&v!C{Z`EHivB73$b zC}K0;etBS>(dQn-)xkfYX}`Tu9vGK(er_AOFCec5sNVs89qf(7>%*4sAl@e4P`1{w zHF8olJFtVe1^jcJekvny$S!+?X)|v+W5f;P*mC2zvv4xJV16Z~oy3#(g$_H3cjDi9 z{tAb#wYPTU6-ap*TH%_l#n)m9(BeOp@Di;5S;8sf*iz#-Xw4|NYW$i!Z>;?gPG`nS zURhHf&-B$UHKDxLHj&L2Qi}OpYOCn0*LDK0%gaKbPVDwh(t3z3Um5bX6u`3s9Lw|O zm(Q4T@2Rb^@|N?WuR*}V^M?Iv?fc7~+4iqpUd=}BBD-k)ZT4&z44H~*HYbO);&X9& z4)JiUgyYjKg|?=TgeEhkS3x7^kPGxG+!}V1UI8_b;xFUWD@Gr<2F}|fVIc^~Hrsnm zBn*rz^V?ah_Bk{XT(bwVudHV7d&u}sb+&^|>6tbsUV!O&t-1DjBiz_vd*TIX@iRt0 zYkf6aw}&Lq`_-(~Ui-~0d@p&oRR$bp^r_*ewGuXKvl?t?wK=Pro!m>>^6`IfFBzc^ zzXNAclpcOZ8=4EZ6mzt_IJl@&VHi#P^&WJ4Unq8l(eLv70sJh5>uho!DR;SVOF6-+ zgMQ76fgA_tlXg_S&SvJr%`WE-`!=8WQp0stmQPypcDtERJlgQnw~S1JL-O41hwk#S zyV-o5b=XHdTAOU)NLw%Sv28mKhtXk5cfHQ0?jzpR?K)e!kF4WoVJ;ve9DFf@fa~m& z0up8jw1;;Ef$Y#eVrF*>$bBzt`Sm4m(e6QZfwuO98$Qfo6V^fnr9#p^q3&T&Gvm@@uzrP{ zmKC<*PON~-yEeF%O+G-}dCG+cVAN+r*E&GFn&IU?V_XK$146fAqaQR5J9U8EcFDC> z)RdlY*9x^c^&^?(`WV7zs{S%|hV|5xTBzr)=hf`ZgQTMq?_Wkg82h#kv-BY8VzNao z8}`K3L3cqGhYpff?d=^g3hpSTT`${)H^Q=B_JsV*UmPTF^Xk4`4AsqnRbDYn?+`vz zOnNuBA2g)`GT4muHR>>JV+m7sZk`)OavuaLzNW zTMTt@pxMt@zuEK-tpB>sp&doCUK{%%eV3B8+QE2wQ6x-Tzkm*u$d}rsMRc1^G#eVU z<)2YkSkP(fm(lhz`Aj=xp%)2B(l)N3yB%S5HF7OIM9Bx5aRYTQl4)A2-PDVaw^`pk zbRKzI`)&{2MaWcbaxT53l5twU?`fie4A%}Ep&KbQ_~a5gS|K59_et7DoZ{4={dST* 
z6U9+Z4eZx5&<1_Ak>xZ(M_yy!R?q|DCrui(X@AfIL|hJA?Jm&Mlyuh~T%;{2@z;!1 z)Kf>gX#Q8}B|T}Y8E?}YB5A{(-lc_p9`$tKQRykk z%S{d1*Z1hRGHI^$dqRT=X~sT%N{0hSp3z6Tca2(zhd4?{ZyB|f?Zhvf=q01(6(IJP z=y{_S-Ba{X>1m^u)K6^E0oLm)-VtZ%$abw^rudx#*K6+_@e&~$v=twS7adw!je;e{ zd_O(Z?3*5MHXnLS`DMNe^-oO@GYMJ6S}hi*!NPsfVzEavlAwKO6+J2WR6Dg=JV43E z+KMc(O-nLG+x?w*NFn3cobSay#2DuWHsS}-O2)EFKZt+Y!o~Z=?c^;jwn&r-c~hHn zK!kPt8|;H0#ot6<=LT)yL9we!!n8TRi0f4{KyxY+H%g?Bw)?Djl8_*6(eEPs(PDS4 z-$k)eA)Q&jYhtY!3s+@J^f5N%{N4U@y=Etpc(ZPMYH~Y~=7r?D|(GB6g z)>QX^K5L@+x$5dg`lyNK<)%9<)4NTyY7bqONN+XKlG^DaWExt_&iUv%xQ5$quoF(f z?d7MBTky3+M6G7(r2EZ@&I!^k_0nw<>BAsxSs&d#LZfO~|6pBf8hu@x6s+s!*eQnJ zs>UCKDbE_oH~mVCOV7byTw8Px_@d5!dod-)U1$4->Dp1(>)Ic~bp0re3)0+%>%J$3 z(x8UMv{2rL89~~`5xOme#$MNwN9x{^TJZZzpR>kMzCrgn zJe;uu8+8$6j^?sSXXr@6nCCB$dYEP`(aqP9!E9-%u7(WK=KQ9+BavWj>KUC*BClyN z6}nhTdTPI&*R66Oowcdgbnv>xTf1{ZS3+s&bnW|Fy1@!*r8V5wU8clMyYxu+h6n?@ z+cVvI9WiMq8+E%yVql9zX{tlYOqejVl!Xa1S+yvcX~s;=pp&kMMAhynQd>%WXJ{@C zQYE2~qTW&ZlSh;pq;)V-v{_9gBZ02C$SC;}+UUggn56#hzB6ps!Q8Dl_1HWUJl21N zg?t2AsLqlvFQk{VR6uBh6T9sqeL zP30Rux0mJ;dc%qJ^oE*knW;s3OXn!P?xY3yNQ-59*-6XwliX!`!HJy+lpfRbPFhJ% zDV);ZomlfA=^gIEAZZl$*B~i`o^xVOy`^@vbSCT8Tk6BZvv@czmaXb7Me^{?-cmn$ z)`@lQBSnB(QXk1m=ou&0EEqBZrw2>pxjzO=-5?|FZm`sh&{IxK?JtFLNA;Jw@*Ru% zODzd4b7ENoASL+g0BH(2$vTHfqk&IDq&7U+S0U0|N=li-U}-Aw-e74^%TgyH`Gr?n zoY@^F3A1roJ@nx8lsrg%WUv%3k`mTu7+iu9_TDgQ04dqNbC@KOBW%}jshk{UNh72T z^0Vd?F104)C)R1C)B~72QkqJ0V%d))rPsKRMoJUNA?=M((lZ??(nh`|4bzcAwtBp@ zmuPHAq_h#Z773vOwt9kegzRIJCQ7?WKGRQSc@so$jg{Jo z+hOB)EM&ZmwR%^Y1}u74DrmXYiQiuir-z%zBtKD%OCQ0jd`t2ksA~9o(rQXJvnz2@ zFHwVt*0ZD~lx$+_=SYcUBWp7k8etA5Hh zO&gg1LMf)DXHytQapv%}2y^T&q2?Gq3tFtLnl@-T3!y2+hj4nm7D??zJw!zElcfo?G)_}fq#Qy%XZup&{L)y@ zrEoFQv^z^-yu9P2*qQ3pUthZ7AhBGp&-AbhSUS`IbTRxTclW_T|kO-vRCu zreuChZ48f?58){O0Q2K2DFx=`!Cy)X$s+CGm(nc)6Z7s&sT-e|FMTDAhKZT^Z;*oM zu{YVf8>H@Z^_y({1{kT+-ehGPU}_A0lRexZy-x4G!3J)WVqyN?xl!5!^Y?^J($9p< z(%O70ty0N!ZSPKLy$tJ*k>5!r3T+*vwfJ6YqtNF8tXDC#`!@mFs$wZbPlmG7$D~l( zAZdAA`ZpniwT54%nIbG0r=OG()ez_($!6#dbtk1PShlB@NgLJvFC%?VL1ce+8(vaCv~K7&#+WT^V@i~bP}M`yTYQ%4Hj1hm`Y5x zQ59}BSn}2_V(lJEZyO;IMCy0~OJ2f8_T@uqU>iKrwgZm(CCRrv?~TwO^Kw4Y>K{t5 z^l!mBKbAs(_aDQ6Z^6z#h8d|jb9e$BvpF081O`HLcI1iF)}{7dV_I{I)xaI6vsf!@ z*yATsFswv-KK>LZaRESJc+u;6{0$~`D?WUF=ZR7LWmb06SVi!utj`rL`Q#Ep08&rGFW;{_gTELhl+_R}Xn3sGabTyVB6O z#qbc2vny_XL*tTbLL)rb-%dMw=$4zY03ttaXtW1SORm8#fhU}8ODG7i$5?Qi$9%{J zw3XKm<2fZC(WPgk+cLXs%j`tjsYA7p8AS5TY(dkK%eKrQj%RimGKM|If)o7E4_VuG z@&pII$5@b`7xyVkX(zXJj{6W&RzpIe+uij1{c-bIK|6V1xUKfC=Kn4>-C+KYa&6)N zhk^&UmnYCV1KZGEKJLb=`?3wvGp#wFH#GLh+?BJKC7ET{|Jci;KjVA%=PYG8X1UdW z?EQantb@1Q`v2>(@BaTdw!~Wwq_qb2!dt#zho~>$VF$OJ_fl_kdbBz>s#= zSkM47j?R`Jo77QmspmQL;03cq5Dc;xdFxOuyj9AxcPO4$kNjj_HJ%Xa!HelJ&(I^^ z*1LM-TjOT46CGtQ15cAa%LGri&{Ljvlv^d*^E=Yb)od>}*>-%WS!Z+$i{NwGp&GYt zzGh=V9Gp)k90q6k_)v`rt}f5a+gA7+u6)0t2keJ)Nvk&&l=4Jw_VetKY3tCo{CUAp z%@o_gX%7xDtB>5&evZ(4KV*A-?tvqM zk0aK(liaqYtr^C7@!K^Q4iIs%SSpu7HW^7zA##B8AFBPq{@s1XL-zg zrgfHw(5uizJIiPEIef}0+LiM;JKsfa-I)(QV}ZSM<}7{LIcxsuoByQu^p)M+c>4eD zhA?*D@|D}#GKby;nJ@B{TlVC~@}RAZTlR+IVJOGe`SR0kGXw8~$+lS_-wHYX>MOU= z^BvFuVamAcE4OZH&*GI12cP_~bT)f0gZYHdHgCA5J+M!^(BJsc|Ma&* zx7}cLL4UK=gO4L#j;+7hN`*dZzhZ}O@!`XFVNL_xjBc+-+FCosepUemx1h7}jEn`f z_D*Ok-#)h7p$FP)-jIC6iwBmnTV0_$=6qr6UyeCnu+DyRo8~-x=(4HXah^AH2IzZT zVBcgvx%aDSH~GoUUH(oBeGby`JWBlJfI+tSD{!|cfiY#@0hfyBsDqwlZ~T|hwz6!U z{_noY%j@IMuQ1fb-urAF?@$$Fmf08f&i0o*?WJEa*?JemH--3({&GKBWzhcgmuJI^ zDT6kur|h6p;r~v}vcwN=@6o;Gmh_x~E$l7#=kD(<52j}gYx=+qLdGc)BDIl z+#`KtGxug6xHp#@G|#?rXLvF+uo1y>Z|>4y`2;;}VB`A9d*~^HCijPB0xdJJ)&t-O zaP$DVJC@7-`}?OG@L=OW+5gong9gd% 
z4ZlHi#F*$WyD3i0li50<^S7YEDxXnZZp43%>XG4QS^zMK7L85TZ74yOsV?CT+N zFi%-AL{8w28!8XuZX7Ca;tm`pkL0c#Ca>VO4U<3T?hcbjaT~(qa5>aoldf2kAtPjeno`T=kC3PE6z4|B^U2R_ zWVj6P@wc*5;c`3fgK)V>oB~V4{E_ku89r`k9U%{-w|B595%OXPUW|~xAwRIhv2q^@ z?Yw`iJc@kJ9LC85fZ^lhw}1`f!=pZx50gx-?Lyg!xL9p{wA@$#Bc|kGiX&W-kOa9`mushBCgCpu-ETVC ze+y3jh4eldKM+ds{2g}ou9*jOR*L7Jr`R_MatHeK6g!q64~E|aoEFK>@NQZ2Tr4jl z;s6drCLK{{97rfHT=jjSb6WACbYv zv1O1Z2{xWDlY`-nLGR_Vr?QOCr&(v%)aCM6h}*YZUIA|rhNQ`JfPHE5Uida<`sWZz zWoJ1lY)HC|eVkq zgbn#Zo&BW~Y6C|*CR{2Ovo{&I8=68TL z6)cI6N_hy~cq@A=6XpRJ18Xv6J)v>6?3>kcYqLG$)0V}0KF#o{Z-#L^e1c`N+JBQa zZO*7=Rjc9d16e#<4bO(3X#Q*DdGKaAl5NY9e}Y#O?|vo28`2Nhq7CvpH1{p-$_ClM zKk&J;5e6}tt$BU}0|Q<{bo~}4eiEmR-Yh>LS@9D2dVhck$BR(fZ{; z`@`>I2X@Q8lti-|d*NbFWuNBCAvCj=73IpkA^0#CPH773{2i1sg-!epvY5g)ekXtD zu+ILIK{BMB_%FE&-BPP9`hD9RByXu@(mv<}6WQ5)um*tATnpr(WP&!SK-TNX zSZ(?bawUbIfCe3qf1+eGlZxTIMzhpnIGs_f*&%r&uTr=WiQm~>j^?|^onmU}Aa9yMBaMOCoc(=cz|U&YijP(cCwxiTn*K67OIU6nuRQ%7x0Gv#MWA33r~Zi>6xq`w-|ZrFc^ii?76 zc<4;4uQ~XF9Wp5HY?qtj%#;4;rgU^Xbf52rZ^FW-X!UMNAZhZz5#D5{CGfZ{FoAV% zr3AFhfjK`d&J>nBlBvj zYy!2P+A0ThwAPU=>7XQ)280)W@(BZXUNtGTgPsSdtAn zP%6KXn&1x7rF75V`}l3JO9rIveRSScxK+Jx*u_5YsCdxuJ#1%3C6LxQvZ{_sM?RP7 zeU!FbUmxX~Yc-Vq#-pUC^X3N|3y#6?ZSq&%qn8{tLx563=%0@4TsLJNy?UCB?5=d?e$riW=d$j~+rE{KLO&gRqhncB z06iqWo$ayGvI>4cZcKjM>SLGm%q509P}wE?CuMUFWq}EfNNt4Izg~;GL7!`XpXKy` zukGsa*$F0YCi@k{V6SXWH0)_=ky`_w0nJ( zw;V}{HguTcrGwv?W{!X>ZphSDj!=eB@{@LMq_UilVm5U&e7+yNqhO2@A;LGpW8YFH zk~u8*Ev1E1>nDwA_jnhBZzAWg^0$<3j`UNlJXiMdD?*}1%9CU7yE6p(t*aW z)*eh&ewWEE_FD|JX6|b3NsQvCgI|TDca;fbn>O`b$b@XwF1@ET6Un#i!7Rm(-q^vs zXT#x@tF@7{75E@x155n?W|j5YkPqQcC*UWZ)$s}=Uu#|GDZcQH37fb;IZoCv?@yF8 zxa(@UpD3;rzCk{+P&q)XEIC2>hJ4Aq7b%b6JBBukmF=+PJhE8H7ac9O!C7NTjy=Uv zK2w64*dxkQi!HnOV&?c~$|SKAHhhwm_To9%7?rHdCK=j+WCiBhbapKTy3yyXU#jv8 z{FWmvg<2cdXkC{oOh;0+BcChq9eg4?vQkNf<UnxX{&yDY{Q^ryMzqHraD^=7m2kUOF zf@<~L1Xp1WOWg#`Ih#q}D3@S)a_t)h?vioLVKeNEV-q(k1AwoAo^ker;c3MuT}acJ zYv`dD(1WSPs&j>Y?YVeJ@?+6rZRDNYCHXP@V4!zxU`@6tuQ@*AalcutO(0^-7A1nb z&$ez+-X_PGvQ@c4-eWhmDnGO=f}ew6!F8zi6aJGBF4~ft!Z8Q7DSgPh?D00`3LW=4 ztK6;(6wR&;npd_GOyE;tZORU%Ie}%*yq(Gl62qij%2Z(bE@cjIb{9mx!=~pzTcx@d?GY&kFoXG9mqhf?Fr=&h0j+0U8cnA;fsOD^GYf4=mC?!X}HCh z?M0<0m;>M?P++`^#To^L460NDbkm?r)@_2+f>|mRf6_xcP^oMpq#K*^r*e`6FuzO6 z8sNw!I7NT9uL@e;k2zdcb^`k@!+7Y*MqW`;#hH-d*(*w0F$Nm-$rZ&!NEhaCRT&P9 zy{fz+&V~d#t}1iHcOgORYw)*6#7CQRUD--tY0|6~8n%NruU1hA@n(x|z_>GO{cb9g z;cJ97+PXTJOX#0F*yCI9w?u#L&_1}WPPSwMewUq&QoO&X{OzIs+cIP+s@z{xk zQ+_rZ{z3`jro2#wa!X&poR^=iJ$RvT|h`n;B1SJi&zc~FJ46f=KvjPGbk{?uNh54&KTB{>l)8>Uys8u$as zhmMeMGMvW_y;{igddETiLgsrc^Vu8+)k|yPsD=>t-)a;AhN|(4?U3*H@$j!7_(P{9 zc{SwrkwKkqdnmhLP{RWsR4aU_!_V3(e8wk${J(i1z;C&_n#&W ztK-EDaOiqx^)>p!ft_4Xd8bZSu`*$-Hep1@X`ZrgT#5zcNsJZ$PHQZpmTBxs! 
z4R7V4PhjnK@0Z8N5vgPZpE@RmTZ?3bD~u$mTXxL6K}UUO3y@oWm*R5Q2IO&w`l z6OnjV%Pd$E4S|Q!GW!~0KudLc(>OSD0%x7BgTJ2v*Sgm-t)<#agpyc&OLdGxCZw?c zsSg|8O5I6=&#{NC)SjXPl$)ai818Frwb+KQepXN7Il#oe1=%6X%BDp#mm?HBOr z)%W~A&tci$>hFBMf^YA63V1q;^-#NtyfLyo)bS3!w$quA4h1~%P=mxWIGsRGbrOGY zTji-X<$uJn%~RdxVE&JU9oZr;bvwQ7z&f;r&JKy+Z>#p>;hk;aTHbQd8rrHe;mSF% z8ST{^T5*QCn$=MH{0y6DhEa0Ufo(IZQ^mGW57isG*bN75kGFaoJ~47&8+_C^Y3XTp z&qw`~Ry(lSouJiV>3qDCI*49_Zqiwu%v*C-XLU4$wI4gH{dKg;L3`n+E++Jn16v%R z{z?CI(57@#DW#RCwP`)nK7?L?yaUx?B1{XcCQw}sg=mRARd@--C&$KK>MF54RLJ_8 z`Z}#}V1K@*{!SCJm=>g7YF+XbEHNt@t7fc(yJrB5n84oQ<^YS;-TyE4U2n+iMuGj( z_!Y94K59toi~4L?><_y}_f=;*^o1*U&p3rYwExmqeP0|1J9-AIkEkx6`Snv6I^;X{ z#~}4rcmch2u-ZYiK>Ufp>HzX9YZ9s!0l$W-!LW2;`XTBCa-3ZoqVfg9u%YU1;K5LJ zFMLO}b(lIDcrZ-eNPc0f!_=w3gD{9Z%7(mdW3`=YoFlB~a2u(^p@SV^w}z`DZ6)^` zp)MEKKov?xsH5$V5cAw_dK(KOB*79d!FaYm<7nN&VZIkvLdrR#)HdW0vy4*BJPR#< zl==?5kSk{`$EXLyA0WjaW7HJ!5Nt%hq23hN!N#aJ)uB@kI0(s4=fRyF77*zR;4{dJ zaGm1YBtM;RNj}3jU6P;1!w06YX|W!Ie;Y%8f-VyT9ldK~Y zI*qkMVE(p010}bD9hPxwH+cJfaGcu7mhk2{)kAC#I}GF1aqvF;o$+dj7zp8QOo;qu(UeonPWf zlhg=t21NX8k~+YhpH#Pp@bc8B))y)@cKo=vYSu1z+_>cDn|aJBNa?}Z;OAEMD=fHTM#@g&wzoEieR=ivFqh4oO{cbfFR^(mxsQU>I-`J~u1}pPf zx#}!yL29;yqJKIiyY_H0&I!`AIDjmqViMKdT!>c%Q^xKMcM1 zONhxjtd56AhWf*57%?8<6%FQZnqc;bJ*N#js`e#=f8ep<@U0j`!(P6}-U2mA>)}t2 zVNY!rxV``VVD(?=BY$T{eo^10j`vwWiMosGerFd;)SmxY93*OcOl{fptN$*IjXVao zai8B=#xd2G2H#*m9D{zBb&g#+28+3>bIk3yy5GcKs~eX+f||gZg1_KdBxJA3e}+7UjrYg4Kgh_vo^c(bYo((nrASO)inoZs2&WiY1=VBeLg zM`=O@`{a~b4KG)tPpe;uWsuw5)9R-rn8lq@htix1mVZXghS#c-%hh3UyRXo;Rj9{_ znOA1{&^9xE84CXiFn--(bru3&MBCmrcIHdA&>LFjdDX+-rT1NcF8vTLUiAgFEfH&= z7>A4M8*VWF^PZ!tDO}drSliOJUNWUu*0PT-s%fN~)^Jh11b-E;UA&@pR74jyxL4j# zE%2dWXI2RI|VB26x4*Y}F-%@AN zAIsQ;+iC!f{)468R=bn7?8n>c6q@x1^SGn-6aAr`V1)7cOjob&qqO)B zZT~$L{szOH9sCOx1L08B(f47Dz#HVP57dt0dvNH<2e341$woeeh1zlm?|rCx+NOn5 z51|e7A%;FuyNf4bpWj;@AH#Hb6=M7!!>XUhjDM`QZ(YqBN@s@C z;NLw)q}{7Y`u7XCNnU}Js~@X=?f9L;V$HOjec2;;Q6Yrz=bHLIpy_&B(lMN=ko#lo|it3-xmy!y~$$z8Pe82fqVOqmJO`MQ`KLrftc5y!h#_P zV!nC`*NE4-;;9S4A!S0pUSB5w9ieK;?{V!a5--gu_A5_vsmKU3S; z&dB;qeNE&~^1nS(J9m;=!Y!EpWiy;u0I!85xeJf7{6~}vl5??Ls4;ZM-dv#UCNANYeJPf4Evzpdm*`bc<@FUk6Y@aIKGWIeorQka9P z{}f17_3MeOwb$$6wS&YCJLpr1jtzIzj|RSW)KB!%*{b+v!v9%C_|Mis6|>GWSA#wT zeghb5(9aMPp~gSLh8MLR)o$Vcd6c8=sN(Z1)JZ=8elkdL(r=UKbG_!E(+e}{y zfApf)`nS}#ru4p^O=_+0*z_B?1lXK#O<=M$TwtqP>+iJwOE2*Evyi7->O;%^`qQv% zb55(b>^GcY8{PFCX#HvSi@W}FdQZ>ZYNP*#`>>5Zu<4Hfkz+p(eVbtQCo8Hy4H6Hr0H24C$<)MF#*6UdZPkjiDzQAHV^}#&6%~Kywt1d8iFa2V=`~q9& zrT3t@7g&*(K8xPgvvF$j z0=br;~P-MQZzB7-g^3~VVD|&XitNsUiS(j*x5E0lzKZjQ8+14KV z{&eI4cCCj#MXZJ$ae?|sdO^>Q2I|*~^$;Suef6ElJnc^xZ`YG1Fet_oWqjw(&K6mN*$A`UJsA z4uXx?Aia<11RK^My|?HM8~cOwZRziN?av@REK|Fs@b@qG53$@6vMZa? 
z2d*{jUED{1fG<<~_0^A|<$7&JU;RLlMpSC2`|Eur@~hT3Sbq}!21PqLO#hb?yq*7f zto{=b{z|0Z1pQ6%6DY9DM7;;B(}z!l3$TxUJy9P4+?l9fNu3#6GD-hF$=7a8($C~C z>&H&kXT$RP#bo`j!0{>i;cmJA42pC)J;V(fa=}EODwnM*ITGsh+A& zGF8H@7jCm<^TY7P$NXsh7!&`!&$v_%%YJ_Ih4%;NqM`r6YTWH@7~p%f-EZqh+3&?K zrs-$U+)6g%9sN*}!*;);&$O+eLu2&+7i-@FS5x-(@9cfLoOAX*yGvYDmrsIM49E1>ol$*#LoTJ$e4zh20!CcrXlxkD9ZkS*FLAwcxT@C z_j~{E`Fx(U)_T^nZqItwv+ipTU=xtasvwQCxThXkK2EHen=hU5oDI?>FkW|w|76Wg zb|2{6rf5cqbQ6dPt%;tp8}`lUo*S8>S;dszAuYeue9P3mE1mrkJJ1;1|1ec^6`K)j zr)k0%*Al`_*DO+fPrjKBee^xKJYD0VN+vclG-0Y_Y0(T#kV+ZiR6&m7GlY-PNfqba}*mV0wGez>~g z_Lb%v7{|ZTjKsc%7^)e|-a_zMp_(nq_LSW`4c$wVTFld|RtdKC(bt{$34Ac^RoAa1 zne#Op*lS2PC|om{dQ?wlVygF4RnC@4+vAAtbmvFy|I(j|y$ckf+Nu{cYWMqrXk)wp;2Er7>X3L&}KO z3{c~43UAO%WHA4kWz^vC%84K{KTdO!88?}9Hfh>0?I)8-CQS+shzKjutt^%Ht<-e0 zVFr(t9<9|RusB@t^?J=(djCV*teMCTMApMLYy4Se%21NEO|yo%K7<5rM{V^OB9(2| z^kY@iq+UBTXKYo0lDuCtk7Z8uB=ZkJgD&eSr5@5uS7VjFzJx4}mcExX9c)y?q?@NT z*j}dx3s6l#V8F1mnlG84yX5#;bbLdId=`4*)?KOBIn6#5lXO>labD9&#V!S1Py-HOkfFuMgFYGv)x83 z>9F5DY3dctrz-XYini;j#+6}C-Xk}zX$CQ6_ehiLnp`IIK2iUo`GeVdpFI6V)0w$+ zpESwQcri5)G)H5vVsmJeziP%YE$@+gzk(Uv?vr^pG-De^+{3}8Z3WSlua&_LX4Qs^ z!wZhVx%7gjyY%vg24~m0lA*UW{nB?4Wm-EAX0%Qwh1;yhXJ9sroGrbArl{K7Bbv*a`&;OlX68geaO ztFvai0oAycHj=5ZmA2Q?I;cAjLO_$z8~=j$B|79V4@HMj#DMe0Z*O6JfetKbl+Z^Q z-iZ4hq^Zu@U-WD|;_vCEZN<)mWxAWTCli+|?QzrA)G{}0 zrPJNDx2>3LTWLct?Esds-6ZAr(YAI_9g&1FT0HTTCYi=+Jz2((O|FjDUS(5}(zXfO zL3|y0ZVxltSO$ZZC=BQ^Sl$g!)lBVRR<%X)ovrO}!#)Fs%-2q~WskxpM{1MQ>~+{S zM{6TlWg6fdt6gWqq-0CSS83HMCM{b!8?XJHSNwKt)^aMRbWBzZi(AI_YleE5jY87_ zJ2Y#e?s&FzWsBBZ(|d|-jhM5BUxnzcs6OH7XXq;ELKw~+gf>|6VJ&RQe_0DN*UH&k zGGe!OLA(DXOkpm0w_DryKMLcQCk5}(<}-AKK^neKJKdUb&Xdj@*2dIS4J6(dwJAym zUwTp7Pni#XnyKx^zCpExXKGz>S@`=*?Oi50Pl~>z&7$pp*iTwft@4zm@rSwFFjpW=}=fkP4NHcruzGJcCD|+jO|Jj$2 zU~kH6RSmEF>w{<=dd)kSICKwT4-`YiR)ySsQW-Aji<^@W+DsvOD&~x;iyf|;j7zC!DAt&KFdn{PGR{S1Yz@+FVuQdI z&&j${27`vqQ+2x)Imv?sXBdprbW>Hj?`@Qeex}2ilDwa;^VTI-AyR%LLuTlPU?{O^ zhR&CcD*l1b`@i)P2z~oQCb|TbJo%=*sY6o-_ku;@FI{W@J6M*X`qZk=weigEc}DqXMZ!?0g9 ztdzPZ>0GT@95s-pZr8b}k-eQOcj)x2eZxw!I!zbOy1>F6(QRbAgL=;q-E9XP3TVHwwa>DM#5 z6&j=>jsH=%n6bU}fMz`60m=VKHmu6hioo1LX8{U&j z{y;><2DJ;6Wl+Jp!z!vFp+AL~&Ijp|Yzxc;h^M8m{R8!xm@JH1n2(CtOkrG5b`-{I%HUAxoe{2C!Ki0LX z`<{*zyI%&Biml+-l&<; zzmp9my1}*YA&SS1ULw_K7y7|&)8Un1{M!=U92Gks!9rf?Jd{nleXn$0^!x%TeWmNl zE{0dT*E(lq_ios0oudxK^3xbof)PGpuXV%O1jO^xYh9L7=j-3;+N;<@koc?$U0*gA z4W(wKj#d9zh3!?Giw*k|2u9g*-{XPGN?WcA`xJ3>(QvD6m7HG|IXozd`T3oiTvOVr zRkgT!%t_o>uf>gIPa)xhwYb{ESc_}Q{$O@_v+1yu*Wx^s?zNje*G|PA1ZrAs&Z+iQ zMl~dM+;Nc2!c(YjZW(-{E5F za9^u$$AItIII9Psr7lTrlg<2xGKX+)~E<2|3q_>%=^?Ca+s@ zLz?(BqI#zn{spgVG4v*(U#J^0Nf+l_O$l^;e@bi4Raw?w+nSS@d}}hK4VS}yg*2ME zau3yP0OHnn;MQ=XpJ4pj#|Ha*5l=`;2hNvCenQ@L;9jvCK~mC@D`S_y@}d*BmK}rO z%iXz6D%Jp7^DbONhVI-+{kw5E1`+p^JnPPNWwM`;W<5}7P-EqQYr&K~AuD=tN0_Xq zq~|AGf5!PKS@8+iiW%{g9QuS?&HQ35d3$h=)##YrdULb%>?MTUHk8X?o;{Vm^y3<{ z>;Yhm!@1TB_e_$8b2kO$?I5n3ioJoHTn^&$)WQy>5|zvTC+Hr@@o8K=Wsij1oX)jn z^Fe4cgG*-V;<2*t1z=@|>m>d#S65mT!R@wEnWPrWIIIKE;)O?X zK|0JCk-2)(o0Z(pD%Q>!>&-uLj#6$s7o$-&Ql@U;QrK6>r^7~W8EflYDJ5;>GW6K6 znz)B+rDDH85Yt}p^-D`wvN?qts&v_fDcpN@MHB2O?Bkj<6RL*cr;j$HgmD zYch_o*oCA0>zR!0;PS8uts|@mXjr3$hqcK_X+vRxP6|B9ol~`?G&D3$MnP)|%L}1+ zJ#0ywQF6sv_@EF#F6!cGAQs&NMI=1)ssN z;}OTobKH2g9fa)VIqpYu)lhbGm+#QXELEe(Y3wP+6C{wNxv|ACXVwO+{u!;hBZSyQqtiK|P_-sjZzLiDG2SJRYm3?)MU?rJ&_ ze2vIIa84{Rq~S&G6U|gy^D-rzhI7k(u5%0jPp~wgC`h2_N;o}eRIEH=|-f+DFxl zPNHMY=LXZ!Cm}*iZK2z{fp%yjrM-Yc(P)X=&)iJ*7gW-gpSeJWwL-(jebQwr_9<{H zu5qp^wh*?X*SY&@dPs@fHKXjgk?gn zk&gE`Keh>q8*`8A%$&JKPTu1}xesigt+2h0W!#?% 
ziGLBRw-2>lM;S}HjF~Q@YRH_ciQa|f3W(LQ>g{L>@j$FLCo%6t-CI>p8K3ZzTtQ-R zy3$p|L(;LYr;s|J-;=#D`$jMz^&Y3T7s+R#_U`R1;_P(X+*_V*d`aB99Z%;Ml98C} z>*=Y5gvFU2F?&W54^3B8N}Q99XAx`ZxPjAQO~+3%E4k_T1>)XCBQNZ&(=WusO48&I}&H6cRa-OcCVU5x|95Z^vH@C@yK+1iu_UO_!}x6oj&## z;=*)1xcqd(E2(o#IzNy6qMl>>i?QX#+Oc}kvc(*z@}{v*$Cln5~idxbO@OoT}gAjoZs3)%=hlarmo1qy@7_V7bkIE_nn$IfsQ!obgWVAt+ z)hbG$3NkL_@`YOBbMR)|3Wd2;47D(e{2*R}?}A6*d1xjt4kfx(tb|Rl2VP=d3}@M_ z;(Z^9QUW)8O}qy0f;b!@-2vGb@g@%o;99T^qxVn;&2S%i8$jeSGFS!m&RcvKn zDP9S6pf-QBTcW|7plA2*mtzb$S2wJK_3m{W)q7j_3n|;!d9HqkFOe%6x#M%?X`+RA z4*m^%dj@xbxDglN^C3vNaomb_Q0{WlmGB^WOYmc7au;?M@BPH{$a5ySO9Lly4O|B2 zu+KxDt$<0Vd=PwX+;RrtI|_(=VsMV&z6@eC)C`3*Y6n4?Mk;B8AULV0YLq5cf`25h zUtBnv{6ZSV0yp8Hij~>Fg5zew6!7P13q>_SB|EOCf*KI_P{{%c&mJI3=k$CuY)lLi zerS1JRi=v&`1cJI9rJTU+q@i6Se_$-lZ;y?>&3oIU1XiDi^g$Ktcz_>4 z96F!{G|CHTBX*9}MG*UG!a`mqbdnBo{3h54>!9zp_Y)yLgQL)pRjOanFT<+ymN2rx zKSeL?H?Vp-CwGuBbgI61;I+i9>G)RSwsbtBr_4K3^+ltP5uboxL2b{>QO*#-Zd=rw z&B%IU^>+6%R&RIPvHHAJHwb=86dQhro}aP3+<4ZZ7b(A^94P>D}eV#IzS- zZG1^jlhnjECXYfJZrrZ?akuk~=JWNe zj6>!8Jvse!;?!&s9cs^4<;E#h{0LT=Wz)(0wmUP=s8)#+<;Ev7^x@gp^sa*_#82hM zhUpxHIXKT~Qsp|+iFTDZl}>17BrFxis4|M20e@rCv;@a#mm6dmALg zTfId#siWVgWi`8dNErF!_@`xto}C>$UH;1oDebxMSNO@nbr! zBL1a2F3Lz&7+Yp>5UYAp?o6X?mY&(07_!lxDS}4^i2?JBp|ka(I|lHv*1Khh?9A#T zJ?W}q^`_Ue?c`RVcgqk1GCluCnOFLh>Hq(i;nEyqG|txZ`{f#ArZdqUv-PZg{ADcS zjMa0EJT4ET>;m$}_3W=!@nqxL8G3$}^K_TWRL=11s=24L=67cqP3nLp#uMjLdAo`| zJ+U^2sjHo*X1u8XhI#7Kf28ln3A~tJ>bYgzL)^RVn|L}t*(bj?{o+>T9_SN))u+ru zeexge6L07f*Y(8ehERw%`!u$>Pocl`iMREMU+WXc`ZWAD`8%fdyxKnMQ|4fw_zU8y z7kdWWFoBPa-tC({g?3D6xM!}uI3pv!$oQ~aA2r2!GDj$O&FHz2QwJ5Mk1WS3SG}Kg zPUcnwBF2acee~cxTXTh?-6*fn3kUL{)c9SlaZ`m}G>{M0cvr6RM->m@jX9BPG^jY6 zIF)N0PRADFLG#ETstenxTw^kEW)8=)0;0#5GEdiyRrB=h0aiXE`tb)J_2L111O#&U en~(b0e29O7ER-)Wo~WQ~mP@y) - + diff --git a/TMessagesProj/src/main/java/org/telegram/android/ImageLoader.java b/TMessagesProj/src/main/java/org/telegram/android/ImageLoader.java index 714479e38..818343f03 100644 --- a/TMessagesProj/src/main/java/org/telegram/android/ImageLoader.java +++ b/TMessagesProj/src/main/java/org/telegram/android/ImageLoader.java @@ -21,6 +21,7 @@ import android.os.Build; import android.os.ParcelFileDescriptor; import android.provider.MediaStore; +import org.telegram.messenger.DispatchQueue; import org.telegram.messenger.FileLoader; import org.telegram.messenger.FileLog; import org.telegram.messenger.TLRPC; @@ -50,9 +51,8 @@ public class ImageLoader { private ConcurrentHashMap imageLoadingByUrl = new ConcurrentHashMap(); private ConcurrentHashMap imageLoadingByKeys = new ConcurrentHashMap(); private HashMap imageLoadingByTag = new HashMap(); - private LinkedList cacheOutTasks = new LinkedList(); private LinkedList httpTasks = new LinkedList(); - private int currentCacheTasksCount = 0; + private DispatchQueue cacheOutQueue = new DispatchQueue("cacheOutQueue"); private int currentHttpTasksCount = 0; protected VMRuntimeHack runtimeHack = null; @@ -84,7 +84,7 @@ public class ImageLoader { httpConnectionStream = httpConnection.getInputStream(); fileOutputStream = new RandomAccessFile(cacheImage.tempFilePath, "rws"); - } catch (Exception e) { + } catch (Throwable e) { FileLog.e("tmessages", e); } @@ -109,7 +109,7 @@ public class ImageLoader { break; } } - } catch (Exception e) { + } catch (Throwable e) { FileLog.e("tmessages", e); } @@ -118,7 +118,7 @@ public class ImageLoader { fileOutputStream.close(); fileOutputStream = null; } - } catch (Exception e) { + } catch (Throwable e) { FileLog.e("tmessages", e); } @@ -127,7 +127,7 @@ public class ImageLoader { 
httpConnectionStream.close(); } httpConnectionStream = null; - } catch (Exception e) { + } catch (Throwable e) { FileLog.e("tmessages", e); } @@ -152,15 +152,27 @@ public class ImageLoader { } } - private class CacheOutTask extends AsyncTask { + private class CacheOutTask implements Runnable { + private Thread runningThread = null; + private final Integer sync = 1; private CacheImage cacheImage = null; + private boolean isCancelled = false; public CacheOutTask(CacheImage cacheImage) { this.cacheImage = cacheImage; } - protected BitmapDrawable doInBackground(Void... voids) { + @Override + public void run() { + synchronized (sync) { + runningThread = Thread.currentThread(); + Thread.interrupted(); + if (isCancelled) { + return; + } + } + Long mediaId = null; Bitmap image = null; File cacheFileFinal = null; @@ -198,8 +210,10 @@ public class ImageLoader { Thread.sleep(delay); } lastCacheOutTime = System.currentTimeMillis(); - if (isCancelled()) { - return null; + synchronized (sync) { + if (isCancelled) { + return; + } } BitmapFactory.Options opts = new BitmapFactory.Options(); @@ -235,8 +249,10 @@ public class ImageLoader { opts.inJustDecodeBounds = false; opts.inSampleSize = (int)scaleFactor; } - if (isCancelled()) { - return null; + synchronized (sync) { + if (isCancelled) { + return; + } } if (cacheImage.filter == null || blur) { @@ -270,31 +286,43 @@ public class ImageLoader { } } if (image != null && blur && bitmapH < 100 && bitmapW < 100) { - Utilities.blurBitmap(image, (int)bitmapW, (int)bitmapH, image.getRowBytes()); + Utilities.blurBitmap(image); } } if (runtimeHack != null) { runtimeHack.trackFree(image.getRowBytes() * image.getHeight()); } } - } catch (Exception e) { + } catch (Throwable e) { //don't promt } - return image != null ? new BitmapDrawable(image) : null; + Thread.interrupted(); + onPostExecute(image != null ? 
new BitmapDrawable(image) : null); } - @Override - protected void onPostExecute(BitmapDrawable bitmapDrawable) { - if (bitmapDrawable != null && memCache.get(cacheImage.key) == null) { - memCache.put(cacheImage.key, bitmapDrawable); + private void onPostExecute(final BitmapDrawable bitmapDrawable) { + AndroidUtilities.RunOnUIThread(new Runnable() { + @Override + public void run() { + if (bitmapDrawable != null && memCache.get(cacheImage.key) == null) { + memCache.put(cacheImage.key, bitmapDrawable); + } + cacheImage.setImageAndClear(bitmapDrawable); + } + }); + } + + public void cancel() { + synchronized (sync) { + try { + isCancelled = true; + if (runningThread != null) { + runningThread.interrupt(); + } + } catch (Exception e) { + //don't promt + } } - cacheImage.setImageAndClear(bitmapDrawable); - runCacheTasks(true); - } - - @Override - protected void onCancelled() { - runCacheTasks(true); } } @@ -402,8 +430,8 @@ public class ImageLoader { FileLoader.getInstance().cancelLoadFile(fileLocation); } if (cacheTask != null) { - cacheOutTasks.remove(cacheTask); - cacheTask.cancel(true); + cacheOutQueue.cancelRunnable(cacheTask); + cacheTask.cancel(); cacheTask = null; } if (httpTask != null) { @@ -743,8 +771,7 @@ public class ImageLoader { img.addImageView(imageView); imageLoadingByKeys.put(key, img); img.cacheTask = new CacheOutTask(img); - cacheOutTasks.add(img.cacheTask); - runCacheTasks(false); + cacheOutQueue.postRunnable(img.cacheTask); } else { img.url = url; img.fileLocation = fileLocation; @@ -791,11 +818,10 @@ public class ImageLoader { cacheImage.filter = imageReceiver.getFilter(); } imageLoadingByKeys.put(cacheImage.key, cacheImage); - cacheOutTasks.add(cacheImage.cacheTask); + cacheOutQueue.postRunnable(cacheImage.cacheTask); } cacheImage.addImageView(imageReceiver); } - runCacheTasks(false); } private void fileDidFailedLoad(String location) { @@ -823,21 +849,6 @@ public class ImageLoader { } } - private void runCacheTasks(boolean complete) { - if (complete) { - currentCacheTasksCount--; - } - while (currentCacheTasksCount < 1 && !cacheOutTasks.isEmpty()) { - CacheOutTask task = cacheOutTasks.poll(); - if (android.os.Build.VERSION.SDK_INT >= 11) { - task.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR, null, null, null); - } else { - task.execute(null, null, null); - } - currentCacheTasksCount++; - } - } - public static Bitmap loadBitmap(String path, Uri uri, float maxWidth, float maxHeight) { BitmapFactory.Options bmOptions = new BitmapFactory.Options(); bmOptions.inJustDecodeBounds = true; @@ -851,7 +862,7 @@ public class ImageLoader { } else { try { path = Utilities.getPath(uri); - } catch (Exception e) { + } catch (Throwable e) { FileLog.e("tmessages", e); } } @@ -865,13 +876,13 @@ public class ImageLoader { parcelFD = ApplicationLoader.applicationContext.getContentResolver().openFileDescriptor(uri, "r"); fileDescriptor = parcelFD.getFileDescriptor(); BitmapFactory.decodeFileDescriptor(fileDescriptor, null, bmOptions); - } catch (Exception e) { + } catch (Throwable e) { FileLog.e("tmessages", e); try { if (parcelFD != null) { parcelFD.close(); } - } catch (Exception e2) { + } catch (Throwable e2) { FileLog.e("tmessages", e2); } return null; @@ -912,7 +923,7 @@ public class ImageLoader { matrix.postRotate(270); break; } - } catch (Exception e) { + } catch (Throwable e) { FileLog.e("tmessages", e); } } @@ -924,7 +935,7 @@ public class ImageLoader { if (b != null) { b = Bitmap.createBitmap(b, 0, 0, b.getWidth(), b.getHeight(), matrix, true); } - } catch (Exception e) { + } 
catch (Throwable e) { FileLog.e("tmessages", e); ImageLoader.getInstance().clearMemory(); if (b == null) { @@ -940,14 +951,14 @@ public class ImageLoader { if (b != null) { b = Bitmap.createBitmap(b, 0, 0, b.getWidth(), b.getHeight(), matrix, true); } - } catch (Exception e) { + } catch (Throwable e) { FileLog.e("tmessages", e); } finally { try { if (parcelFD != null) { parcelFD.close(); } - } catch (Exception e) { + } catch (Throwable e) { FileLog.e("tmessages", e); } } @@ -1005,7 +1016,7 @@ public class ImageLoader { scaledBitmap.recycle(); } return size; - } catch (Exception e) { + } catch (Throwable e) { return null; } } diff --git a/TMessagesProj/src/main/java/org/telegram/android/LocaleController.java b/TMessagesProj/src/main/java/org/telegram/android/LocaleController.java index df53514f4..c582bc0e4 100644 --- a/TMessagesProj/src/main/java/org/telegram/android/LocaleController.java +++ b/TMessagesProj/src/main/java/org/telegram/android/LocaleController.java @@ -296,15 +296,15 @@ public class LocaleController { } } - private String getLocaleString(Locale locale) { + public static String getLocaleString(Locale locale) { if (locale == null) { - return ""; + return "en"; } String languageCode = locale.getLanguage(); String countryCode = locale.getCountry(); String variantCode = locale.getVariant(); if (languageCode.length() == 0 && countryCode.length() == 0) { - return ""; + return "en"; } StringBuilder result = new StringBuilder(11); result.append(languageCode); diff --git a/TMessagesProj/src/main/java/org/telegram/android/MediaController.java b/TMessagesProj/src/main/java/org/telegram/android/MediaController.java index c617e56a5..829b29ee8 100644 --- a/TMessagesProj/src/main/java/org/telegram/android/MediaController.java +++ b/TMessagesProj/src/main/java/org/telegram/android/MediaController.java @@ -492,7 +492,6 @@ public class MediaController implements NotificationCenter.NotificationCenterDel if (currentMask == lastCheckMask) { return; } - FileLog.e("tmessages", "check download mask = " + currentMask); lastCheckMask = currentMask; if ((currentMask & AUTODOWNLOAD_MASK_PHOTO) != 0) { if (photoDownloadQueue.isEmpty()) { @@ -582,9 +581,11 @@ public class MediaController implements NotificationCenter.NotificationCenterDel } else if (type == AUTODOWNLOAD_MASK_DOCUMENT) { queue = documentDownloadQueue; } - queue.addAll(objects); - for (int a = 0; a < queue.size(); a++) { - DownloadObject downloadObject = queue.get(a); + for (DownloadObject downloadObject : objects) { + String path = FileLoader.getAttachFileName(downloadObject.object); + if (downloadQueueKeys.containsKey(path)) { + continue; + } boolean added = true; if (downloadObject.object instanceof TLRPC.Audio) { @@ -597,13 +598,10 @@ public class MediaController implements NotificationCenter.NotificationCenterDel FileLoader.getInstance().loadFile((TLRPC.Document)downloadObject.object); } else { added = false; - queue.remove(a); - a--; } if (added) { - String path = FileLoader.getAttachFileName(downloadObject.object); + queue.add(downloadObject); downloadQueueKeys.put(path, downloadObject); - FileLog.e("tmessages", "download file " + path); } } } @@ -627,7 +625,6 @@ public class MediaController implements NotificationCenter.NotificationCenterDel private void checkDownloadFinished(String fileName, boolean canceled) { DownloadObject downloadObject = downloadQueueKeys.get(fileName); if (downloadObject != null) { - FileLog.e("tmessages", "check download finished " + fileName + " canceled = " + canceled); downloadQueueKeys.remove(fileName); 
if (!canceled) { MessagesStorage.getInstance().removeFromDownloadQueue(downloadObject.id, downloadObject.type); @@ -1349,6 +1346,8 @@ public class MediaController implements NotificationCenter.NotificationCenterDel } public void startRecording(final long dialog_id) { + clenupPlayer(true); + try { Vibrator v = (Vibrator) ApplicationLoader.applicationContext.getSystemService(Context.VIBRATOR_SERVICE); v.vibrate(20); diff --git a/TMessagesProj/src/main/java/org/telegram/android/MessageObject.java b/TMessagesProj/src/main/java/org/telegram/android/MessageObject.java index 5ec21af4a..6ccc2bdbb 100644 --- a/TMessagesProj/src/main/java/org/telegram/android/MessageObject.java +++ b/TMessagesProj/src/main/java/org/telegram/android/MessageObject.java @@ -216,7 +216,20 @@ public class MessageObject { } } else if (message.action instanceof TLRPC.TL_messageActionLoginUnknownLocation) { String date = String.format("%s %s %s", LocaleController.formatterYear.format(((long)message.date) * 1000), LocaleController.getString("OtherAt", R.string.OtherAt), LocaleController.formatterDay.format(((long)message.date) * 1000)); - messageText = LocaleController.formatString("NotificationUnrecognizedDevice", R.string.NotificationUnrecognizedDevice, UserConfig.getCurrentUser().first_name, date, message.action.title, message.action.address); + TLRPC.User to_user = UserConfig.getCurrentUser(); + if (to_user == null) { + if (users != null) { + to_user = users.get(messageOwner.to_id.user_id); + } + if (to_user == null) { + to_user = MessagesController.getInstance().getUser(messageOwner.to_id.user_id); + } + } + String name = ""; + if (to_user != null) { + name = to_user.first_name; + } + messageText = LocaleController.formatString("NotificationUnrecognizedDevice", R.string.NotificationUnrecognizedDevice, name, date, message.action.title, message.action.address); } else if (message.action instanceof TLRPC.TL_messageActionUserJoined) { if (fromUser != null) { messageText = LocaleController.formatString("NotificationContactJoined", R.string.NotificationContactJoined, Utilities.formatName(fromUser.first_name, fromUser.last_name)); @@ -453,7 +466,6 @@ public class MessageObject { float prevOffset = 0; for (int a = 0; a < blocksCount; a++) { - int currentBlockLinesCount = Math.min(LINES_PER_BLOCK, linesCount - linesOffset); TextLayoutBlock block = new TextLayoutBlock(); @@ -569,6 +581,9 @@ public class MessageObject { linesOffset += currentBlockLinesCount; } + if (blockHeight == 0) { + blockHeight = 1; + } } public boolean isOut() { diff --git a/TMessagesProj/src/main/java/org/telegram/android/MessagesController.java b/TMessagesProj/src/main/java/org/telegram/android/MessagesController.java index 457e10ec3..9d584f6f9 100644 --- a/TMessagesProj/src/main/java/org/telegram/android/MessagesController.java +++ b/TMessagesProj/src/main/java/org/telegram/android/MessagesController.java @@ -914,6 +914,9 @@ public class MessagesController implements NotificationCenter.NotificationCenter } public void deleteMessages(ArrayList messages, ArrayList randoms, TLRPC.EncryptedChat encryptedChat) { + if (messages == null) { + return; + } for (Integer id : messages) { MessageObject obj = dialogMessage.get(id); if (obj != null) { @@ -2209,7 +2212,7 @@ public class MessagesController implements NotificationCenter.NotificationCenter req.token = regid; req.app_sandbox = false; try { - req.lang_code = Locale.getDefault().getCountry(); + req.lang_code = LocaleController.getLocaleString(Locale.getDefault()); req.device_model = Build.MANUFACTURER + 
Build.MODEL; if (req.device_model == null) { req.device_model = "Android unknown"; @@ -3326,7 +3329,7 @@ public class MessagesController implements NotificationCenter.NotificationCenter public void run() { int updateMask = 0; if (!markAsReadMessages.isEmpty()) { - NotificationCenter.getInstance().postNotificationName(NotificationCenter.messagesReaded, markAsReadMessages); + NotificationCenter.getInstance().postNotificationName(NotificationCenter.messagesRead, markAsReadMessages); NotificationsController.getInstance().processReadMessages(markAsReadMessages, 0, 0, 0, false); for (Integer id : markAsReadMessages) { diff --git a/TMessagesProj/src/main/java/org/telegram/android/MessagesStorage.java b/TMessagesProj/src/main/java/org/telegram/android/MessagesStorage.java index 86f151365..47b0e3dfc 100644 --- a/TMessagesProj/src/main/java/org/telegram/android/MessagesStorage.java +++ b/TMessagesProj/src/main/java/org/telegram/android/MessagesStorage.java @@ -32,6 +32,7 @@ import org.telegram.ui.ApplicationLoader; import java.io.File; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.Locale; import java.util.Map; @@ -1947,6 +1948,56 @@ public class MessagesStorage { } } buffersStorage.reuseFreeBuffer(data); + + Collections.sort(res.messages, new Comparator() { + @Override + public int compare(TLRPC.Message lhs, TLRPC.Message rhs) { + if (lhs.id > 0 && rhs.id > 0) { + if (!forward) { + if (lhs.id > rhs.id) { + return -1; + } else if (lhs.id < rhs.id) { + return 1; + } + } else { + if (lhs.id < rhs.id) { + return -1; + } else if (lhs.id > rhs.id) { + return 1; + } + } + } else if (lhs.id < 0 && rhs.id < 0) { + if (!forward) { + if (lhs.id < rhs.id) { + return -1; + } else if (lhs.id > rhs.id) { + return 1; + } + } else { + if (lhs.id > rhs.id) { + return -1; + } else if (lhs.id < rhs.id) { + return 1; + } + } + } else { + if (!forward) { + if (lhs.date > rhs.date) { + return -1; + } else if (lhs.date < rhs.date) { + return 1; + } + } else { + if (lhs.date < rhs.date) { + return -1; + } else if (lhs.date > rhs.date) { + return 1; + } + } + } + return 0; + } + }); } cursor.dispose(); @@ -2261,9 +2312,6 @@ public class MessagesStorage { } private void putUsersAndChatsInternal(final ArrayList users, final ArrayList chats, final boolean withTransaction) { - if (Thread.currentThread().getId() != storageQueue.getId()) { - throw new RuntimeException("wrong db thread"); - } try { if (withTransaction) { database.beginTransaction(); diff --git a/TMessagesProj/src/main/java/org/telegram/android/NativeLoader.java b/TMessagesProj/src/main/java/org/telegram/android/NativeLoader.java index a161dc6a5..04e039b2d 100644 --- a/TMessagesProj/src/main/java/org/telegram/android/NativeLoader.java +++ b/TMessagesProj/src/main/java/org/telegram/android/NativeLoader.java @@ -24,9 +24,9 @@ import java.util.zip.ZipFile; public class NativeLoader { private static final long sizes[] = new long[] { - 803472, //armeabi - 856740, //armeabi-v7a - 1250356, //x86 + 811664, //armeabi + 864932, //armeabi-v7a + 1262644, //x86 0, //mips }; diff --git a/TMessagesProj/src/main/java/org/telegram/android/NotificationCenter.java b/TMessagesProj/src/main/java/org/telegram/android/NotificationCenter.java index 23eaa2f92..f45b7bb93 100644 --- a/TMessagesProj/src/main/java/org/telegram/android/NotificationCenter.java +++ b/TMessagesProj/src/main/java/org/telegram/android/NotificationCenter.java @@ -18,7 +18,7 @@ public class NotificationCenter { public static final int 
dialogsNeedReload = 4; public static final int closeChats = 5; public static final int messagesDeleted = 6; - public static final int messagesReaded = 7; + public static final int messagesRead = 7; public static final int messagesDidLoaded = 8; public static final int messageReceivedByAck = 9; public static final int messageReceivedByServer = 10; diff --git a/TMessagesProj/src/main/java/org/telegram/android/NotificationsController.java b/TMessagesProj/src/main/java/org/telegram/android/NotificationsController.java index 71b44953d..b5777a760 100644 --- a/TMessagesProj/src/main/java/org/telegram/android/NotificationsController.java +++ b/TMessagesProj/src/main/java/org/telegram/android/NotificationsController.java @@ -432,6 +432,11 @@ public class NotificationsController { } if (choosenSoundPath != null && !choosenSoundPath.equals("NoSound")) { if (choosenSoundPath.equals(defaultPath)) { + /*MediaPlayer mediaPlayer = new MediaPlayer(); + mediaPlayer.setAudioStreamType(AudioManager.STREAM_ALARM); + mediaPlayer.setDataSource(ApplicationLoader.applicationContext, Settings.System.DEFAULT_NOTIFICATION_URI); + mediaPlayer.prepare(); + mediaPlayer.start();*/ mBuilder.setSound(Settings.System.DEFAULT_NOTIFICATION_URI, AudioManager.STREAM_NOTIFICATION); } else { mBuilder.setSound(Uri.parse(choosenSoundPath), AudioManager.STREAM_NOTIFICATION); @@ -444,10 +449,10 @@ public class NotificationsController { mBuilder.setVibrate(new long[]{0, 0}); } else if (needVibrate == 1) { mBuilder.setVibrate(new long[]{0, 100, 0, 100}); - } else if (needVibrate == 0 || needVibrate == 5) { + } else if (needVibrate == 0 || needVibrate == 4) { mBuilder.setDefaults(NotificationCompat.DEFAULT_VIBRATE); } else if (needVibrate == 3) { - mBuilder.setVibrate(new long[]{0, 500}); + mBuilder.setVibrate(new long[]{0, 1000}); } } else { mBuilder.setVibrate(new long[]{0, 0}); @@ -693,9 +698,9 @@ public class NotificationsController { } if (total_unread_count == 0) { popupMessages.clear(); - showOrUpdateNotification(false); NotificationCenter.getInstance().postNotificationName(NotificationCenter.pushMessagesUpdated); } + showOrUpdateNotification(SystemClock.uptimeMillis() / 1000 < 60); if (preferences.getBoolean("badgeNumber", true)) { setBadge(ApplicationLoader.applicationContext, total_unread_count); diff --git a/TMessagesProj/src/main/java/org/telegram/android/PhotoObject.java b/TMessagesProj/src/main/java/org/telegram/android/PhotoObject.java index 28dca2689..eb079ef03 100644 --- a/TMessagesProj/src/main/java/org/telegram/android/PhotoObject.java +++ b/TMessagesProj/src/main/java/org/telegram/android/PhotoObject.java @@ -11,6 +11,7 @@ package org.telegram.android; import android.graphics.Bitmap; import android.graphics.BitmapFactory; +import org.telegram.messenger.FileLog; import org.telegram.messenger.TLRPC; import org.telegram.messenger.Utilities; @@ -29,14 +30,18 @@ public class PhotoObject { opts.inDither = false; opts.outWidth = photo.w; opts.outHeight = photo.h; - image = BitmapFactory.decodeByteArray(photoOwner.bytes, 0, photoOwner.bytes.length, opts); - if (image != null) { - if (preview == 2) { - Utilities.blurBitmap(image, image.getWidth(), image.getHeight(), image.getRowBytes()); - } - if (ImageLoader.getInstance().runtimeHack != null) { - ImageLoader.getInstance().runtimeHack.trackFree(image.getRowBytes() * image.getHeight()); + try { + image = BitmapFactory.decodeByteArray(photoOwner.bytes, 0, photoOwner.bytes.length, opts); + if (image != null) { + if (preview == 2) { + Utilities.blurBitmap(image); + } + if 
(ImageLoader.getInstance().runtimeHack != null) { + ImageLoader.getInstance().runtimeHack.trackFree(image.getRowBytes() * image.getHeight()); + } } + } catch (Throwable throwable) { + FileLog.e("tmessages", throwable); } } } diff --git a/TMessagesProj/src/main/java/org/telegram/android/SendMessagesHelper.java b/TMessagesProj/src/main/java/org/telegram/android/SendMessagesHelper.java index 68518aa41..762143016 100644 --- a/TMessagesProj/src/main/java/org/telegram/android/SendMessagesHelper.java +++ b/TMessagesProj/src/main/java/org/telegram/android/SendMessagesHelper.java @@ -12,6 +12,7 @@ import org.telegram.messenger.BuffersStorage; import org.telegram.messenger.ByteBufferDesc; import org.telegram.messenger.ConnectionsManager; import org.telegram.messenger.FileLoader; +import org.telegram.messenger.FileLog; import org.telegram.messenger.MessageKeyData; import org.telegram.messenger.RPCRequest; import org.telegram.messenger.TLObject; @@ -270,6 +271,7 @@ public class SendMessagesHelper implements NotificationCenter.NotificationCenter } private void sendMessage(String message, double lat, double lon, TLRPC.TL_photo photo, TLRPC.TL_video video, MessageObject msgObj, TLRPC.User user, TLRPC.TL_document document, TLRPC.TL_audio audio, String originalPath, long peer, boolean retry) { + TLRPC.Message newMsg = null; int type = -1; if (retry) { @@ -290,7 +292,7 @@ public class SendMessagesHelper implements NotificationCenter.NotificationCenter if (msgObj.messageOwner instanceof TLRPC.TL_messageForwarded) { type = 4; } else { - photo = (TLRPC.TL_photo)newMsg.media.photo; + photo = (TLRPC.TL_photo) newMsg.media.photo; type = 2; } } else if (msgObj.type == 3) { @@ -298,7 +300,7 @@ public class SendMessagesHelper implements NotificationCenter.NotificationCenter type = 4; } else { type = 3; - video = (TLRPC.TL_video)newMsg.media.video; + video = (TLRPC.TL_video) newMsg.media.video; video.path = newMsg.attachPath; } } else if (msgObj.type == 12 || msgObj.type == 13) { @@ -309,11 +311,11 @@ public class SendMessagesHelper implements NotificationCenter.NotificationCenter user.id = newMsg.media.user_id; type = 6; } else if (msgObj.type == 8 || msgObj.type == 9) { - document = (TLRPC.TL_document)newMsg.media.document; + document = (TLRPC.TL_document) newMsg.media.document; document.path = newMsg.attachPath; type = 7; } else if (msgObj.type == 2) { - audio = (TLRPC.TL_audio)newMsg.media.audio; + audio = (TLRPC.TL_audio) newMsg.media.audio; audio.path = newMsg.attachPath; type = 8; } @@ -400,8 +402,8 @@ public class SendMessagesHelper implements NotificationCenter.NotificationCenter } newMsg.unread = true; newMsg.dialog_id = peer; - int lower_id = (int)peer; - int high_id = (int)(peer >> 32); + int lower_id = (int) peer; + int high_id = (int) (peer >> 32); TLRPC.EncryptedChat encryptedChat = null; TLRPC.InputPeer sendToPeer = null; ArrayList sendToPeers = null; @@ -467,297 +469,304 @@ public class SendMessagesHelper implements NotificationCenter.NotificationCenter MessagesController.getInstance().updateInterfaceWithMessages(peer, objArr); NotificationCenter.getInstance().postNotificationName(NotificationCenter.dialogsNeedReload); - if (type == 0) { - if (encryptedChat == null) { - if (sendToPeers != null) { - TLRPC.TL_messages_sendBroadcast reqSend = new TLRPC.TL_messages_sendBroadcast(); - reqSend.message = message; - reqSend.contacts = sendToPeers; - reqSend.media = new TLRPC.TL_inputMediaEmpty(); - performSendMessageRequest(reqSend, newMsgObj, null); + try { + if (type == 0) { + if (encryptedChat == null) { 
+ if (sendToPeers != null) { + TLRPC.TL_messages_sendBroadcast reqSend = new TLRPC.TL_messages_sendBroadcast(); + reqSend.message = message; + reqSend.contacts = sendToPeers; + reqSend.media = new TLRPC.TL_inputMediaEmpty(); + performSendMessageRequest(reqSend, newMsgObj, null); + } else { + TLRPC.TL_messages_sendMessage reqSend = new TLRPC.TL_messages_sendMessage(); + reqSend.message = message; + reqSend.peer = sendToPeer; + reqSend.random_id = newMsg.random_id; + performSendMessageRequest(reqSend, newMsgObj, null); + } } else { - TLRPC.TL_messages_sendMessage reqSend = new TLRPC.TL_messages_sendMessage(); - reqSend.message = message; - reqSend.peer = sendToPeer; + TLRPC.TL_decryptedMessage reqSend = new TLRPC.TL_decryptedMessage(); reqSend.random_id = newMsg.random_id; - performSendMessageRequest(reqSend, newMsgObj, null); + reqSend.random_bytes = new byte[Math.max(1, (int) Math.ceil(Utilities.random.nextDouble() * 16))]; + Utilities.random.nextBytes(reqSend.random_bytes); + reqSend.message = message; + reqSend.media = new TLRPC.TL_decryptedMessageMediaEmpty(); + performSendEncryptedRequest(reqSend, newMsgObj, encryptedChat, null, null); } - } else { - TLRPC.TL_decryptedMessage reqSend = new TLRPC.TL_decryptedMessage(); - reqSend.random_id = newMsg.random_id; - reqSend.random_bytes = new byte[Math.max(1, (int)Math.ceil(Utilities.random.nextDouble() * 16))]; - Utilities.random.nextBytes(reqSend.random_bytes); - reqSend.message = message; - reqSend.media = new TLRPC.TL_decryptedMessageMediaEmpty(); - performSendEncryptedRequest(reqSend, newMsgObj, encryptedChat, null, null); - } - } else if (type >= 1 && type <= 3 || type >= 5 && type <= 8) { - if (encryptedChat == null) { - TLRPC.InputMedia inputMedia = null; - DelayedMessage delayedMessage = null; - if (type == 1) { - inputMedia = new TLRPC.TL_inputMediaGeoPoint(); - inputMedia.geo_point = new TLRPC.TL_inputGeoPoint(); - inputMedia.geo_point.lat = lat; - inputMedia.geo_point._long = lon; - } else if (type == 2) { - if (photo.access_hash == 0) { - inputMedia = new TLRPC.TL_inputMediaUploadedPhoto(); - delayedMessage = new DelayedMessage(); - delayedMessage.originalPath = originalPath; - delayedMessage.type = 0; - delayedMessage.obj = newMsgObj; - delayedMessage.location = photo.sizes.get(photo.sizes.size() - 1).location; - } else { - TLRPC.TL_inputMediaPhoto media = new TLRPC.TL_inputMediaPhoto(); - media.id = new TLRPC.TL_inputPhoto(); - media.id.id = photo.id; - media.id.access_hash = photo.access_hash; - inputMedia = media; - } - } else if (type == 3) { - if (video.access_hash == 0) { - inputMedia = new TLRPC.TL_inputMediaUploadedThumbVideo(); - inputMedia.duration = video.duration; - inputMedia.w = video.w; - inputMedia.h = video.h; - inputMedia.mime_type = video.mime_type; - delayedMessage = new DelayedMessage(); - delayedMessage.originalPath = originalPath; - delayedMessage.type = 1; - delayedMessage.obj = newMsgObj; - delayedMessage.location = video.thumb.location; - delayedMessage.videoLocation = video; - } else { - TLRPC.TL_inputMediaVideo media = new TLRPC.TL_inputMediaVideo(); - media.id = new TLRPC.TL_inputVideo(); - media.id.id = video.id; - media.id.access_hash = video.access_hash; - inputMedia = media; - } - } else if (type == 6) { - inputMedia = new TLRPC.TL_inputMediaContact(); - inputMedia.phone_number = user.phone; - inputMedia.first_name = user.first_name; - inputMedia.last_name = user.last_name; - } else if (type == 7) { - if (document.access_hash == 0) { - if (document.thumb.location != null && 
document.thumb.location instanceof TLRPC.TL_fileLocation) { - inputMedia = new TLRPC.TL_inputMediaUploadedThumbDocument(); + } else if (type >= 1 && type <= 3 || type >= 5 && type <= 8) { + if (encryptedChat == null) { + TLRPC.InputMedia inputMedia = null; + DelayedMessage delayedMessage = null; + if (type == 1) { + inputMedia = new TLRPC.TL_inputMediaGeoPoint(); + inputMedia.geo_point = new TLRPC.TL_inputGeoPoint(); + inputMedia.geo_point.lat = lat; + inputMedia.geo_point._long = lon; + } else if (type == 2) { + if (photo.access_hash == 0) { + inputMedia = new TLRPC.TL_inputMediaUploadedPhoto(); + delayedMessage = new DelayedMessage(); + delayedMessage.originalPath = originalPath; + delayedMessage.type = 0; + delayedMessage.obj = newMsgObj; + delayedMessage.location = photo.sizes.get(photo.sizes.size() - 1).location; } else { - inputMedia = new TLRPC.TL_inputMediaUploadedDocument(); + TLRPC.TL_inputMediaPhoto media = new TLRPC.TL_inputMediaPhoto(); + media.id = new TLRPC.TL_inputPhoto(); + media.id.id = photo.id; + media.id.access_hash = photo.access_hash; + inputMedia = media; + } + } else if (type == 3) { + if (video.access_hash == 0) { + inputMedia = new TLRPC.TL_inputMediaUploadedThumbVideo(); + inputMedia.duration = video.duration; + inputMedia.w = video.w; + inputMedia.h = video.h; + inputMedia.mime_type = video.mime_type; + delayedMessage = new DelayedMessage(); + delayedMessage.originalPath = originalPath; + delayedMessage.type = 1; + delayedMessage.obj = newMsgObj; + delayedMessage.location = video.thumb.location; + delayedMessage.videoLocation = video; + } else { + TLRPC.TL_inputMediaVideo media = new TLRPC.TL_inputMediaVideo(); + media.id = new TLRPC.TL_inputVideo(); + media.id.id = video.id; + media.id.access_hash = video.access_hash; + inputMedia = media; + } + } else if (type == 6) { + inputMedia = new TLRPC.TL_inputMediaContact(); + inputMedia.phone_number = user.phone; + inputMedia.first_name = user.first_name; + inputMedia.last_name = user.last_name; + } else if (type == 7) { + if (document.access_hash == 0) { + if (document.thumb.location != null && document.thumb.location instanceof TLRPC.TL_fileLocation) { + inputMedia = new TLRPC.TL_inputMediaUploadedThumbDocument(); + } else { + inputMedia = new TLRPC.TL_inputMediaUploadedDocument(); + } + inputMedia.mime_type = document.mime_type; + inputMedia.file_name = document.file_name; + delayedMessage = new DelayedMessage(); + delayedMessage.originalPath = originalPath; + delayedMessage.type = 2; + delayedMessage.obj = newMsgObj; + delayedMessage.documentLocation = document; + delayedMessage.location = document.thumb.location; + } else { + TLRPC.TL_inputMediaDocument media = new TLRPC.TL_inputMediaDocument(); + media.id = new TLRPC.TL_inputDocument(); + media.id.id = document.id; + media.id.access_hash = document.access_hash; + inputMedia = media; + } + } else if (type == 8) { + if (audio.access_hash == 0) { + inputMedia = new TLRPC.TL_inputMediaUploadedAudio(); + inputMedia.duration = audio.duration; + inputMedia.mime_type = audio.mime_type; + delayedMessage = new DelayedMessage(); + delayedMessage.type = 3; + delayedMessage.obj = newMsgObj; + delayedMessage.audioLocation = audio; + } else { + TLRPC.TL_inputMediaAudio media = new TLRPC.TL_inputMediaAudio(); + media.id = new TLRPC.TL_inputAudio(); + media.id.id = audio.id; + media.id.access_hash = audio.access_hash; + inputMedia = media; } - inputMedia.mime_type = document.mime_type; - inputMedia.file_name = document.file_name; - delayedMessage = new DelayedMessage(); - 
delayedMessage.originalPath = originalPath; - delayedMessage.type = 2; - delayedMessage.obj = newMsgObj; - delayedMessage.documentLocation = document; - delayedMessage.location = document.thumb.location; - } else { - TLRPC.TL_inputMediaDocument media = new TLRPC.TL_inputMediaDocument(); - media.id = new TLRPC.TL_inputDocument(); - media.id.id = document.id; - media.id.access_hash = document.access_hash; - inputMedia = media; } - } else if (type == 8) { - if (audio.access_hash == 0) { - inputMedia = new TLRPC.TL_inputMediaUploadedAudio(); - inputMedia.duration = audio.duration; - inputMedia.mime_type = audio.mime_type; - delayedMessage = new DelayedMessage(); + + TLObject reqSend = null; + + if (sendToPeers != null) { + TLRPC.TL_messages_sendBroadcast request = new TLRPC.TL_messages_sendBroadcast(); + request.contacts = sendToPeers; + request.media = inputMedia; + request.message = ""; + if (delayedMessage != null) { + delayedMessage.sendRequest = request; + } + reqSend = request; + } else { + TLRPC.TL_messages_sendMedia request = new TLRPC.TL_messages_sendMedia(); + request.peer = sendToPeer; + request.random_id = newMsg.random_id; + request.media = inputMedia; + if (delayedMessage != null) { + delayedMessage.sendRequest = request; + } + reqSend = request; + } + if (type == 1) { + performSendMessageRequest(reqSend, newMsgObj, null); + } else if (type == 2) { + if (photo.access_hash == 0) { + performSendDelayedMessage(delayedMessage); + } else { + performSendMessageRequest(reqSend, newMsgObj, null); + } + } else if (type == 3) { + if (video.access_hash == 0) { + performSendDelayedMessage(delayedMessage); + } else { + performSendMessageRequest(reqSend, newMsgObj, null); + } + } else if (type == 6) { + performSendMessageRequest(reqSend, newMsgObj, null); + } else if (type == 7) { + if (document.access_hash == 0) { + performSendDelayedMessage(delayedMessage); + } else { + performSendMessageRequest(reqSend, newMsgObj, null); + } + } else if (type == 8) { + if (audio.access_hash == 0) { + performSendDelayedMessage(delayedMessage); + } else { + performSendMessageRequest(reqSend, newMsgObj, null); + } + } + } else { + TLRPC.TL_decryptedMessage reqSend = new TLRPC.TL_decryptedMessage(); + reqSend.random_id = newMsg.random_id; + reqSend.random_bytes = new byte[Math.max(1, (int) Math.ceil(Utilities.random.nextDouble() * 16))]; + Utilities.random.nextBytes(reqSend.random_bytes); + reqSend.message = ""; + if (type == 1) { + reqSend.media = new TLRPC.TL_decryptedMessageMediaGeoPoint(); + reqSend.media.lat = lat; + reqSend.media._long = lon; + performSendEncryptedRequest(reqSend, newMsgObj, encryptedChat, null, null); + } else if (type == 2) { + TLRPC.PhotoSize small = photo.sizes.get(0); + TLRPC.PhotoSize big = photo.sizes.get(photo.sizes.size() - 1); + reqSend.media = new TLRPC.TL_decryptedMessageMediaPhoto(); + reqSend.media.thumb = small.bytes; + reqSend.media.thumb_h = small.h; + reqSend.media.thumb_w = small.w; + reqSend.media.w = big.w; + reqSend.media.h = big.h; + reqSend.media.size = big.size; + if (big.location.key == null) { + DelayedMessage delayedMessage = new DelayedMessage(); + delayedMessage.originalPath = originalPath; + delayedMessage.sendEncryptedRequest = reqSend; + delayedMessage.type = 0; + delayedMessage.obj = newMsgObj; + delayedMessage.encryptedChat = encryptedChat; + delayedMessage.location = photo.sizes.get(photo.sizes.size() - 1).location; + performSendDelayedMessage(delayedMessage); + } else { + TLRPC.TL_inputEncryptedFile encryptedFile = new TLRPC.TL_inputEncryptedFile(); + 
encryptedFile.id = big.location.volume_id; + encryptedFile.access_hash = big.location.secret; + reqSend.media.key = big.location.key; + reqSend.media.iv = big.location.iv; + performSendEncryptedRequest(reqSend, newMsgObj, encryptedChat, encryptedFile, null); + } + } else if (type == 3) { + reqSend.media = new TLRPC.TL_decryptedMessageMediaVideo_old(); + reqSend.media.duration = video.duration; + reqSend.media.size = video.size; + reqSend.media.w = video.w; + reqSend.media.h = video.h; + reqSend.media.thumb = video.thumb.bytes; + reqSend.media.thumb_h = video.thumb.h; + reqSend.media.thumb_w = video.thumb.w; + reqSend.media.mime_type = "video/mp4"; + if (video.access_hash == 0) { + DelayedMessage delayedMessage = new DelayedMessage(); + delayedMessage.originalPath = originalPath; + delayedMessage.sendEncryptedRequest = reqSend; + delayedMessage.type = 1; + delayedMessage.obj = newMsgObj; + delayedMessage.encryptedChat = encryptedChat; + delayedMessage.videoLocation = video; + performSendDelayedMessage(delayedMessage); + } else { + TLRPC.TL_inputEncryptedFile encryptedFile = new TLRPC.TL_inputEncryptedFile(); + encryptedFile.id = video.id; + encryptedFile.access_hash = video.access_hash; + reqSend.media.key = video.key; + reqSend.media.iv = video.iv; + performSendEncryptedRequest(reqSend, newMsgObj, encryptedChat, encryptedFile, null); + } + } else if (type == 6) { + reqSend.media = new TLRPC.TL_decryptedMessageMediaContact(); + reqSend.media.phone_number = user.phone; + reqSend.media.first_name = user.first_name; + reqSend.media.last_name = user.last_name; + reqSend.media.user_id = user.id; + performSendEncryptedRequest(reqSend, newMsgObj, encryptedChat, null, null); + } else if (type == 7) { + reqSend.media = new TLRPC.TL_decryptedMessageMediaDocument(); + reqSend.media.size = document.size; + if (!(document.thumb instanceof TLRPC.TL_photoSizeEmpty)) { + reqSend.media.thumb = document.thumb.bytes; + reqSend.media.thumb_h = document.thumb.h; + reqSend.media.thumb_w = document.thumb.w; + } else { + reqSend.media.thumb = new byte[0]; + reqSend.media.thumb_h = 0; + reqSend.media.thumb_w = 0; + } + reqSend.media.file_name = document.file_name; + reqSend.media.mime_type = document.mime_type; + if (document.access_hash == 0) { + DelayedMessage delayedMessage = new DelayedMessage(); + delayedMessage.originalPath = originalPath; + delayedMessage.sendEncryptedRequest = reqSend; + delayedMessage.type = 2; + delayedMessage.obj = newMsgObj; + delayedMessage.encryptedChat = encryptedChat; + delayedMessage.documentLocation = document; + performSendDelayedMessage(delayedMessage); + } else { + TLRPC.TL_inputEncryptedFile encryptedFile = new TLRPC.TL_inputEncryptedFile(); + encryptedFile.id = document.id; + encryptedFile.access_hash = document.access_hash; + reqSend.media.key = document.key; + reqSend.media.iv = document.iv; + performSendEncryptedRequest(reqSend, newMsgObj, encryptedChat, encryptedFile, null); + } + } else if (type == 8) { + reqSend.media = new TLRPC.TL_decryptedMessageMediaAudio_old(); + reqSend.media.duration = audio.duration; + reqSend.media.size = audio.size; + reqSend.media.mime_type = "audio/ogg"; + + DelayedMessage delayedMessage = new DelayedMessage(); + delayedMessage.sendEncryptedRequest = reqSend; delayedMessage.type = 3; delayedMessage.obj = newMsgObj; + delayedMessage.encryptedChat = encryptedChat; delayedMessage.audioLocation = audio; - } else { - TLRPC.TL_inputMediaAudio media = new TLRPC.TL_inputMediaAudio(); - media.id = new TLRPC.TL_inputAudio(); - media.id.id = audio.id; 
- media.id.access_hash = audio.access_hash; - inputMedia = media; + performSendDelayedMessage(delayedMessage); } } - - TLObject reqSend = null; - - if (sendToPeers != null) { - TLRPC.TL_messages_sendBroadcast request = new TLRPC.TL_messages_sendBroadcast(); - request.contacts = sendToPeers; - request.media = inputMedia; - request.message = ""; - if (delayedMessage != null) { - delayedMessage.sendRequest = request; - } - reqSend = request; - } else { - TLRPC.TL_messages_sendMedia request = new TLRPC.TL_messages_sendMedia(); - request.peer = sendToPeer; - request.random_id = newMsg.random_id; - request.media = inputMedia; - if (delayedMessage != null) { - delayedMessage.sendRequest = request; - } - reqSend = request; - } - if (type == 1) { - performSendMessageRequest(reqSend, newMsgObj, null); - } else if (type == 2) { - if (photo.access_hash == 0) { - performSendDelayedMessage(delayedMessage); - } else { - performSendMessageRequest(reqSend, newMsgObj, null); - } - } else if (type == 3) { - if (video.access_hash == 0) { - performSendDelayedMessage(delayedMessage); - } else { - performSendMessageRequest(reqSend, newMsgObj, null); - } - } else if (type == 6) { - performSendMessageRequest(reqSend, newMsgObj, null); - } else if (type == 7) { - if (document.access_hash == 0) { - performSendDelayedMessage(delayedMessage); - } else { - performSendMessageRequest(reqSend, newMsgObj, null); - } - } else if (type == 8) { - if (audio.access_hash == 0) { - performSendDelayedMessage(delayedMessage); - } else { - performSendMessageRequest(reqSend, newMsgObj, null); - } - } - } else { - TLRPC.TL_decryptedMessage reqSend = new TLRPC.TL_decryptedMessage(); + } else if (type == 4) { + TLRPC.TL_messages_forwardMessage reqSend = new TLRPC.TL_messages_forwardMessage(); + reqSend.peer = sendToPeer; reqSend.random_id = newMsg.random_id; - reqSend.random_bytes = new byte[Math.max(1, (int)Math.ceil(Utilities.random.nextDouble() * 16))]; - Utilities.random.nextBytes(reqSend.random_bytes); - reqSend.message = ""; - if (type == 1) { - reqSend.media = new TLRPC.TL_decryptedMessageMediaGeoPoint(); - reqSend.media.lat = lat; - reqSend.media._long = lon; - performSendEncryptedRequest(reqSend, newMsgObj, encryptedChat, null, null); - } else if (type == 2) { - TLRPC.PhotoSize small = photo.sizes.get(0); - TLRPC.PhotoSize big = photo.sizes.get(photo.sizes.size() - 1); - reqSend.media = new TLRPC.TL_decryptedMessageMediaPhoto(); - reqSend.media.thumb = small.bytes; - reqSend.media.thumb_h = small.h; - reqSend.media.thumb_w = small.w; - reqSend.media.w = big.w; - reqSend.media.h = big.h; - reqSend.media.size = big.size; - if (big.location.key == null) { - DelayedMessage delayedMessage = new DelayedMessage(); - delayedMessage.originalPath = originalPath; - delayedMessage.sendEncryptedRequest = reqSend; - delayedMessage.type = 0; - delayedMessage.obj = newMsgObj; - delayedMessage.encryptedChat = encryptedChat; - delayedMessage.location = photo.sizes.get(photo.sizes.size() - 1).location; - performSendDelayedMessage(delayedMessage); - } else { - TLRPC.TL_inputEncryptedFile encryptedFile = new TLRPC.TL_inputEncryptedFile(); - encryptedFile.id = big.location.volume_id; - encryptedFile.access_hash = big.location.secret; - reqSend.media.key = big.location.key; - reqSend.media.iv = big.location.iv; - performSendEncryptedRequest(reqSend, newMsgObj, encryptedChat, encryptedFile, null); - } - } else if (type == 3) { - reqSend.media = new TLRPC.TL_decryptedMessageMediaVideo_old(); - reqSend.media.duration = video.duration; - 
reqSend.media.size = video.size; - reqSend.media.w = video.w; - reqSend.media.h = video.h; - reqSend.media.thumb = video.thumb.bytes; - reqSend.media.thumb_h = video.thumb.h; - reqSend.media.thumb_w = video.thumb.w; - reqSend.media.mime_type = "video/mp4"; - if (video.access_hash == 0) { - DelayedMessage delayedMessage = new DelayedMessage(); - delayedMessage.originalPath = originalPath; - delayedMessage.sendEncryptedRequest = reqSend; - delayedMessage.type = 1; - delayedMessage.obj = newMsgObj; - delayedMessage.encryptedChat = encryptedChat; - delayedMessage.videoLocation = video; - performSendDelayedMessage(delayedMessage); - } else { - TLRPC.TL_inputEncryptedFile encryptedFile = new TLRPC.TL_inputEncryptedFile(); - encryptedFile.id = video.id; - encryptedFile.access_hash = video.access_hash; - reqSend.media.key = video.key; - reqSend.media.iv = video.iv; - performSendEncryptedRequest(reqSend, newMsgObj, encryptedChat, encryptedFile, null); - } - } else if (type == 6) { - reqSend.media = new TLRPC.TL_decryptedMessageMediaContact(); - reqSend.media.phone_number = user.phone; - reqSend.media.first_name = user.first_name; - reqSend.media.last_name = user.last_name; - reqSend.media.user_id = user.id; - performSendEncryptedRequest(reqSend, newMsgObj, encryptedChat, null, null); - } else if (type == 7) { - reqSend.media = new TLRPC.TL_decryptedMessageMediaDocument(); - reqSend.media.size = document.size; - if (!(document.thumb instanceof TLRPC.TL_photoSizeEmpty)) { - reqSend.media.thumb = document.thumb.bytes; - reqSend.media.thumb_h = document.thumb.h; - reqSend.media.thumb_w = document.thumb.w; - } else { - reqSend.media.thumb = new byte[0]; - reqSend.media.thumb_h = 0; - reqSend.media.thumb_w = 0; - } - reqSend.media.file_name = document.file_name; - reqSend.media.mime_type = document.mime_type; - if (document.access_hash == 0) { - DelayedMessage delayedMessage = new DelayedMessage(); - delayedMessage.originalPath = originalPath; - delayedMessage.sendEncryptedRequest = reqSend; - delayedMessage.type = 2; - delayedMessage.obj = newMsgObj; - delayedMessage.encryptedChat = encryptedChat; - delayedMessage.documentLocation = document; - performSendDelayedMessage(delayedMessage); - } else { - TLRPC.TL_inputEncryptedFile encryptedFile = new TLRPC.TL_inputEncryptedFile(); - encryptedFile.id = document.id; - encryptedFile.access_hash = document.access_hash; - reqSend.media.key = document.key; - reqSend.media.iv = document.iv; - performSendEncryptedRequest(reqSend, newMsgObj, encryptedChat, encryptedFile, null); - } - } else if (type == 8) { - reqSend.media = new TLRPC.TL_decryptedMessageMediaAudio_old(); - reqSend.media.duration = audio.duration; - reqSend.media.size = audio.size; - reqSend.media.mime_type = "audio/ogg"; - - DelayedMessage delayedMessage = new DelayedMessage(); - delayedMessage.sendEncryptedRequest = reqSend; - delayedMessage.type = 3; - delayedMessage.obj = newMsgObj; - delayedMessage.encryptedChat = encryptedChat; - delayedMessage.audioLocation = audio; - performSendDelayedMessage(delayedMessage); + if (msgObj.messageOwner.id >= 0) { + reqSend.id = msgObj.messageOwner.id; + } else { + reqSend.id = msgObj.messageOwner.fwd_msg_id; } + performSendMessageRequest(reqSend, newMsgObj, null); } - } else if (type == 4) { - TLRPC.TL_messages_forwardMessage reqSend = new TLRPC.TL_messages_forwardMessage(); - reqSend.peer = sendToPeer; - reqSend.random_id = newMsg.random_id; - if (msgObj.messageOwner.id >= 0) { - reqSend.id = msgObj.messageOwner.id; - } else { - reqSend.id = 
msgObj.messageOwner.fwd_msg_id; - } - performSendMessageRequest(reqSend, newMsgObj, null); + } catch (Exception e) { + FileLog.e("tmessages", e); + MessagesStorage.getInstance().markMessageAsSendError(newMsgObj.messageOwner.id); + newMsgObj.messageOwner.send_state = MessageObject.MESSAGE_SEND_STATE_SEND_ERROR; + NotificationCenter.getInstance().postNotificationName(NotificationCenter.messageSendError, newMsgObj.messageOwner.id); } } @@ -788,7 +797,11 @@ public class SendMessagesHelper implements NotificationCenter.NotificationCenter location = AndroidUtilities.getCacheDir() + "/" + message.videoLocation.id + ".mp4"; } putToDelayedMessages(location, message); - FileLoader.getInstance().uploadFile(location, false, false); + if (message.videoLocation.estimatedSize) { + FileLoader.getInstance().uploadFile(location, false, false, message.videoLocation.size); + } else { + FileLoader.getInstance().uploadFile(location, false, false); + } } } else { String location = message.videoLocation.path; @@ -796,7 +809,11 @@ public class SendMessagesHelper implements NotificationCenter.NotificationCenter location = AndroidUtilities.getCacheDir() + "/" + message.videoLocation.id + ".mp4"; } putToDelayedMessages(location, message); - FileLoader.getInstance().uploadFile(location, true, false); + if (message.videoLocation.estimatedSize) { + FileLoader.getInstance().uploadFile(location, true, false, message.videoLocation.size); + } else { + FileLoader.getInstance().uploadFile(location, true, false); + } } } else if (message.type == 2) { TLRPC.InputMedia media = null; @@ -841,6 +858,7 @@ public class SendMessagesHelper implements NotificationCenter.NotificationCenter if (response instanceof TLRPC.TL_messages_sentMessage) { TLRPC.TL_messages_sentMessage res = (TLRPC.TL_messages_sentMessage) response; newMsgObj.messageOwner.id = res.id; + newMsgObj.messageOwner.date = res.date; MessagesController.getInstance().processNewDifferenceParams(res.seq, res.pts, res.date); } else if (response instanceof TLRPC.messages_StatedMessage) { TLRPC.messages_StatedMessage res = (TLRPC.messages_StatedMessage) response; diff --git a/TMessagesProj/src/main/java/org/telegram/android/video/InputSurface.java b/TMessagesProj/src/main/java/org/telegram/android/video/InputSurface.java new file mode 100644 index 000000000..914e8be6d --- /dev/null +++ b/TMessagesProj/src/main/java/org/telegram/android/video/InputSurface.java @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.telegram.android.video; + +import android.annotation.TargetApi; +import android.opengl.EGL14; +import android.opengl.EGLExt; +import android.opengl.EGLConfig; +import android.opengl.EGLContext; +import android.opengl.EGLDisplay; +import android.opengl.EGLSurface; +import android.view.Surface; + +@TargetApi(17) +public class InputSurface { + private static final boolean VERBOSE = false; + private static final int EGL_RECORDABLE_ANDROID = 0x3142; + private static final int EGL_OPENGL_ES2_BIT = 4; + private EGLDisplay mEGLDisplay; + private EGLContext mEGLContext; + private EGLSurface mEGLSurface; + private Surface mSurface; + + public InputSurface(Surface surface) { + if (surface == null) { + throw new NullPointerException(); + } + mSurface = surface; + eglSetup(); + } + + private void eglSetup() { + mEGLDisplay = EGL14.eglGetDisplay(EGL14.EGL_DEFAULT_DISPLAY); + if (mEGLDisplay == EGL14.EGL_NO_DISPLAY) { + throw new RuntimeException("unable to get EGL14 display"); + } + int[] version = new int[2]; + if (!EGL14.eglInitialize(mEGLDisplay, version, 0, version, 1)) { + mEGLDisplay = null; + throw new RuntimeException("unable to initialize EGL14"); + } + + int[] attribList = { + EGL14.EGL_RED_SIZE, 8, + EGL14.EGL_GREEN_SIZE, 8, + EGL14.EGL_BLUE_SIZE, 8, + EGL14.EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, + EGL_RECORDABLE_ANDROID, 1, + EGL14.EGL_NONE + }; + EGLConfig[] configs = new EGLConfig[1]; + int[] numConfigs = new int[1]; + if (!EGL14.eglChooseConfig(mEGLDisplay, attribList, 0, configs, 0, configs.length, + numConfigs, 0)) { + throw new RuntimeException("unable to find RGB888+recordable ES2 EGL config"); + } + + int[] attrib_list = { + EGL14.EGL_CONTEXT_CLIENT_VERSION, 2, + EGL14.EGL_NONE + }; + + mEGLContext = EGL14.eglCreateContext(mEGLDisplay, configs[0], EGL14.EGL_NO_CONTEXT, attrib_list, 0); + checkEglError("eglCreateContext"); + if (mEGLContext == null) { + throw new RuntimeException("null context"); + } + + int[] surfaceAttribs = { + EGL14.EGL_NONE + }; + mEGLSurface = EGL14.eglCreateWindowSurface(mEGLDisplay, configs[0], mSurface, + surfaceAttribs, 0); + checkEglError("eglCreateWindowSurface"); + if (mEGLSurface == null) { + throw new RuntimeException("surface was null"); + } + } + + public void release() { + if (EGL14.eglGetCurrentContext().equals(mEGLContext)) { + EGL14.eglMakeCurrent(mEGLDisplay, EGL14.EGL_NO_SURFACE, EGL14.EGL_NO_SURFACE, EGL14.EGL_NO_CONTEXT); + } + EGL14.eglDestroySurface(mEGLDisplay, mEGLSurface); + EGL14.eglDestroyContext(mEGLDisplay, mEGLContext); + mSurface.release(); + mEGLDisplay = null; + mEGLContext = null; + mEGLSurface = null; + mSurface = null; + } + + public void makeCurrent() { + if (!EGL14.eglMakeCurrent(mEGLDisplay, mEGLSurface, mEGLSurface, mEGLContext)) { + throw new RuntimeException("eglMakeCurrent failed"); + } + } + + public boolean swapBuffers() { + return EGL14.eglSwapBuffers(mEGLDisplay, mEGLSurface); + } + + public Surface getSurface() { + return mSurface; + } + + public void setPresentationTime(long nsecs) { + EGLExt.eglPresentationTimeANDROID(mEGLDisplay, mEGLSurface, nsecs); + } + + private void checkEglError(String msg) { + boolean failed = false; + int error; + while ((error = EGL14.eglGetError()) != EGL14.EGL_SUCCESS) { + failed = true; + } + if (failed) { + throw new RuntimeException("EGL error encountered (see log)"); + } + } +} diff --git a/TMessagesProj/src/main/java/org/telegram/android/video/MP4Builder.java b/TMessagesProj/src/main/java/org/telegram/android/video/MP4Builder.java new file mode 100644 index 
000000000..fb8d70102 --- /dev/null +++ b/TMessagesProj/src/main/java/org/telegram/android/video/MP4Builder.java @@ -0,0 +1,430 @@ +/* + * This is the source code of Telegram for Android v. 1.7.x. + * It is licensed under GNU GPL v. 2 or later. + * You should have received a copy of the license in this archive (see LICENSE). + * + * Copyright Nikolai Kudashov, 2013-2014. + */ + +package org.telegram.android.video; + +import android.annotation.TargetApi; +import android.media.MediaCodec; +import android.media.MediaFormat; + +import com.coremedia.iso.BoxParser; +import com.coremedia.iso.IsoFile; +import com.coremedia.iso.IsoTypeWriter; +import com.coremedia.iso.boxes.Box; +import com.coremedia.iso.boxes.Container; +import com.coremedia.iso.boxes.DataEntryUrlBox; +import com.coremedia.iso.boxes.DataInformationBox; +import com.coremedia.iso.boxes.DataReferenceBox; +import com.coremedia.iso.boxes.FileTypeBox; +import com.coremedia.iso.boxes.HandlerBox; +import com.coremedia.iso.boxes.MediaBox; +import com.coremedia.iso.boxes.MediaHeaderBox; +import com.coremedia.iso.boxes.MediaInformationBox; +import com.coremedia.iso.boxes.MovieBox; +import com.coremedia.iso.boxes.MovieHeaderBox; +import com.coremedia.iso.boxes.SampleSizeBox; +import com.coremedia.iso.boxes.SampleTableBox; +import com.coremedia.iso.boxes.SampleToChunkBox; +import com.coremedia.iso.boxes.StaticChunkOffsetBox; +import com.coremedia.iso.boxes.SyncSampleBox; +import com.coremedia.iso.boxes.TimeToSampleBox; +import com.coremedia.iso.boxes.TrackBox; +import com.coremedia.iso.boxes.TrackHeaderBox; +import com.googlecode.mp4parser.DataSource; +import com.googlecode.mp4parser.util.Matrix; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.WritableByteChannel; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; + +@TargetApi(16) +public class MP4Builder { + + private InterleaveChunkMdat mdat = null; + private Mp4Movie currentMp4Movie = null; + FileOutputStream fos = null; + private FileChannel fc = null; + private long dataOffset = 0; + private long writedSinceLastMdat = 0; + private boolean writeNewMdat = true; + HashMap track2SampleSizes = new HashMap(); + + public MP4Builder createMovie(Mp4Movie mp4Movie) throws Exception { + currentMp4Movie = mp4Movie; + + fos = new FileOutputStream(mp4Movie.getCacheFile()); + fc = fos.getChannel(); + + FileTypeBox fileTypeBox = createFileTypeBox(); + fileTypeBox.getBox(fc); + dataOffset += fileTypeBox.getSize(); + writedSinceLastMdat += dataOffset; + + mdat = new InterleaveChunkMdat(); + + return this; + } + + private void flushCurrentMdat() throws Exception { + long oldPosition = fc.position(); + fc.position(mdat.getOffset()); + mdat.getBox(fc); + fc.position(oldPosition); + mdat.setDataOffset(0); + mdat.setContentSize(0); + fos.flush(); + } + + public void writeSampleData(int trackIndex, ByteBuffer byteBuf, MediaCodec.BufferInfo bufferInfo) throws Exception { + if (writeNewMdat) { + mdat.setContentSize(0); + mdat.getBox(fc); + mdat.setDataOffset(dataOffset); + dataOffset += 16; + writedSinceLastMdat += 16; + writeNewMdat = false; + } + + mdat.setContentSize(mdat.getContentSize() + bufferInfo.size); + writedSinceLastMdat += bufferInfo.size; + + boolean flush = false; + if (writedSinceLastMdat >= 32 * 1024) { + flushCurrentMdat(); + writeNewMdat = true; + flush = true; + writedSinceLastMdat -= 32 * 1024; + } + + 
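+            // Illustrative note: samples are appended into the currently open 'mdat' box; once
+            // about 32 KB has accumulated, flushCurrentMdat() seeks back and rewrites that box
+            // header with its real size, the next write starts a fresh 'mdat', and the stream
+            // is flushed so the partially written file stays readable for the parallel upload.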
currentMp4Movie.addSample(trackIndex, dataOffset, bufferInfo); + byteBuf.position(bufferInfo.offset); + byteBuf.limit(bufferInfo.offset + bufferInfo.size); + fc.write(byteBuf); + dataOffset += bufferInfo.size; + + if (flush) { + fos.flush(); + } + } + + public int addTrack(MediaFormat mediaFormat, boolean isVideo) throws Exception { + return currentMp4Movie.addTrack(mediaFormat, isVideo); + } + + public void finishMovie(boolean error) throws Exception { + if (mdat.getContentSize() != 0) { + flushCurrentMdat(); + } + + for (Track track : currentMp4Movie.getTracks()) { + List samples = track.getSamples(); + long[] sizes = new long[samples.size()]; + for (int i = 0; i < sizes.length; i++) { + sizes[i] = samples.get(i).getSize(); + } + track2SampleSizes.put(track, sizes); + } + + Box moov = createMovieBox(currentMp4Movie); + moov.getBox(fc); + fos.flush(); + + fc.close(); + fos.close(); + } + + protected FileTypeBox createFileTypeBox() { + LinkedList minorBrands = new LinkedList(); + minorBrands.add("isom"); + minorBrands.add("3gp4"); + return new FileTypeBox("isom", 0, minorBrands); + } + + private class InterleaveChunkMdat implements Box { + private Container parent; + private long contentSize = 1024 * 1024 * 1024; + private long dataOffset = 0; + + public Container getParent() { + return parent; + } + + public long getOffset() { + return dataOffset; + } + + public void setDataOffset(long offset) { + dataOffset = offset; + } + + public void setParent(Container parent) { + this.parent = parent; + } + + public void setContentSize(long contentSize) { + this.contentSize = contentSize; + } + + public long getContentSize() { + return contentSize; + } + + public String getType() { + return "mdat"; + } + + public long getSize() { + return 16 + contentSize; + } + + private boolean isSmallBox(long contentSize) { + return (contentSize + 8) < 4294967296L; + } + + @Override + public void parse(DataSource dataSource, ByteBuffer header, long contentSize, BoxParser boxParser) throws IOException { + + } + + public void getBox(WritableByteChannel writableByteChannel) throws IOException { + ByteBuffer bb = ByteBuffer.allocate(16); + long size = getSize(); + if (isSmallBox(size)) { + IsoTypeWriter.writeUInt32(bb, size); + } else { + IsoTypeWriter.writeUInt32(bb, 1); + } + bb.put(IsoFile.fourCCtoBytes("mdat")); + if (isSmallBox(size)) { + bb.put(new byte[8]); + } else { + IsoTypeWriter.writeUInt64(bb, size); + } + bb.rewind(); + writableByteChannel.write(bb); + } + } + + public static long gcd(long a, long b) { + if (b == 0) { + return a; + } + return gcd(b, a % b); + } + + public long getTimescale(Mp4Movie mp4Movie) { + long timescale = mp4Movie.getTracks().iterator().next().getTimeScale(); + for (Track track : mp4Movie.getTracks()) { + timescale = gcd(track.getTimeScale(), timescale); + } + return timescale; + } + + protected MovieBox createMovieBox(Mp4Movie movie) { + MovieBox movieBox = new MovieBox(); + MovieHeaderBox mvhd = new MovieHeaderBox(); + + mvhd.setCreationTime(new Date()); + mvhd.setModificationTime(new Date()); + mvhd.setMatrix(Matrix.ROTATE_0); + long movieTimeScale = getTimescale(movie); + long duration = 0; + + for (Track track : movie.getTracks()) { + long tracksDuration = track.getDuration() * movieTimeScale / track.getTimeScale(); + if (tracksDuration > duration) { + duration = tracksDuration; + } + } + + mvhd.setDuration(duration); + mvhd.setTimescale(movieTimeScale); + mvhd.setNextTrackId(movie.getTracks().size() + 1); + + movieBox.addBox(mvhd); + for (Track track : movie.getTracks()) { 
+ movieBox.addBox(createTrackBox(track, movie)); + } + return movieBox; + } + + protected TrackBox createTrackBox(Track track, Mp4Movie movie) { + TrackBox trackBox = new TrackBox(); + TrackHeaderBox tkhd = new TrackHeaderBox(); + + tkhd.setEnabled(true); + tkhd.setInMovie(true); + tkhd.setInPreview(true); + if (track.isAudio()) { + tkhd.setMatrix(Matrix.ROTATE_0); + } else { + tkhd.setMatrix(movie.getMatrix()); + } + tkhd.setAlternateGroup(0); + tkhd.setCreationTime(track.getCreationTime()); + tkhd.setDuration(track.getDuration() * getTimescale(movie) / track.getTimeScale()); + tkhd.setHeight(track.getHeight()); + tkhd.setWidth(track.getWidth()); + tkhd.setLayer(0); + tkhd.setModificationTime(new Date()); + tkhd.setTrackId(track.getTrackId() + 1); + tkhd.setVolume(track.getVolume()); + + trackBox.addBox(tkhd); + + MediaBox mdia = new MediaBox(); + trackBox.addBox(mdia); + MediaHeaderBox mdhd = new MediaHeaderBox(); + mdhd.setCreationTime(track.getCreationTime()); + mdhd.setDuration(track.getDuration()); + mdhd.setTimescale(track.getTimeScale()); + mdhd.setLanguage("eng"); + mdia.addBox(mdhd); + HandlerBox hdlr = new HandlerBox(); + hdlr.setName(track.isAudio() ? "SoundHandle" : "VideoHandle"); + hdlr.setHandlerType(track.getHandler()); + + mdia.addBox(hdlr); + + MediaInformationBox minf = new MediaInformationBox(); + minf.addBox(track.getMediaHeaderBox()); + + DataInformationBox dinf = new DataInformationBox(); + DataReferenceBox dref = new DataReferenceBox(); + dinf.addBox(dref); + DataEntryUrlBox url = new DataEntryUrlBox(); + url.setFlags(1); + dref.addBox(url); + minf.addBox(dinf); + + Box stbl = createStbl(track); + minf.addBox(stbl); + mdia.addBox(minf); + + return trackBox; + } + + protected Box createStbl(Track track) { + SampleTableBox stbl = new SampleTableBox(); + + createStsd(track, stbl); + createStts(track, stbl); + createStss(track, stbl); + createStsc(track, stbl); + createStsz(track, stbl); + createStco(track, stbl); + + return stbl; + } + + protected void createStsd(Track track, SampleTableBox stbl) { + stbl.addBox(track.getSampleDescriptionBox()); + } + + protected void createStts(Track track, SampleTableBox stbl) { + TimeToSampleBox.Entry lastEntry = null; + List entries = new ArrayList(); + + for (long delta : track.getSampleDurations()) { + if (lastEntry != null && lastEntry.getDelta() == delta) { + lastEntry.setCount(lastEntry.getCount() + 1); + } else { + lastEntry = new TimeToSampleBox.Entry(1, delta); + entries.add(lastEntry); + } + } + TimeToSampleBox stts = new TimeToSampleBox(); + stts.setEntries(entries); + stbl.addBox(stts); + } + + protected void createStss(Track track, SampleTableBox stbl) { + long[] syncSamples = track.getSyncSamples(); + if (syncSamples != null && syncSamples.length > 0) { + SyncSampleBox stss = new SyncSampleBox(); + stss.setSampleNumber(syncSamples); + stbl.addBox(stss); + } + } + + protected void createStsc(Track track, SampleTableBox stbl) { + SampleToChunkBox stsc = new SampleToChunkBox(); + stsc.setEntries(new LinkedList()); + + long lastOffset = -1; + int lastChunkNumber = 1; + int lastSampleCount = 0; + + int previousWritedChunkCount = -1; + + int samplesCount = track.getSamples().size(); + for (int a = 0; a < samplesCount; a++) { + Sample sample = track.getSamples().get(a); + long offset = sample.getOffset(); + long size = sample.getSize(); + + lastOffset = offset + size; + lastSampleCount++; + + boolean write = false; + if (a != samplesCount - 1) { + Sample nextSample = track.getSamples().get(a + 1); + if (lastOffset != 
nextSample.getOffset()) { + write = true; + } + } else { + write = true; + } + if (write) { + if (previousWritedChunkCount != lastSampleCount) { + stsc.getEntries().add(new SampleToChunkBox.Entry(lastChunkNumber, lastSampleCount, 1)); + previousWritedChunkCount = lastSampleCount; + } + lastSampleCount = 0; + lastChunkNumber++; + } + } + stbl.addBox(stsc); + } + + protected void createStsz(Track track, SampleTableBox stbl) { + SampleSizeBox stsz = new SampleSizeBox(); + stsz.setSampleSizes(track2SampleSizes.get(track)); + stbl.addBox(stsz); + } + + protected void createStco(Track track, SampleTableBox stbl) { + ArrayList chunksOffsets = new ArrayList(); + long lastOffset = -1; + for (Sample sample : track.getSamples()) { + long offset = sample.getOffset(); + if (lastOffset != -1 && lastOffset != offset) { + lastOffset = -1; + } + if (lastOffset == -1) { + chunksOffsets.add(offset); + } + lastOffset = offset + sample.getSize(); + } + long[] chunkOffsetsLong = new long[chunksOffsets.size()]; + for (int a = 0; a < chunksOffsets.size(); a++) { + chunkOffsetsLong[a] = chunksOffsets.get(a); + } + + StaticChunkOffsetBox stco = new StaticChunkOffsetBox(); + stco.setChunkOffsets(chunkOffsetsLong); + stbl.addBox(stco); + } +} diff --git a/TMessagesProj/src/main/java/org/telegram/android/video/Mp4Movie.java b/TMessagesProj/src/main/java/org/telegram/android/video/Mp4Movie.java new file mode 100644 index 000000000..4492bc312 --- /dev/null +++ b/TMessagesProj/src/main/java/org/telegram/android/video/Mp4Movie.java @@ -0,0 +1,81 @@ +/* + * This is the source code of Telegram for Android v. 1.7.x. + * It is licensed under GNU GPL v. 2 or later. + * You should have received a copy of the license in this archive (see LICENSE). + * + * Copyright Nikolai Kudashov, 2013-2014. 
+ */ + +package org.telegram.android.video; + +import android.annotation.TargetApi; +import android.media.MediaCodec; +import android.media.MediaFormat; + +import com.googlecode.mp4parser.util.Matrix; + +import java.io.File; +import java.util.ArrayList; + +@TargetApi(16) +public class Mp4Movie { + private Matrix matrix = Matrix.ROTATE_0; + private ArrayList tracks = new ArrayList(); + private File cacheFile; + private int width; + private int height; + + public Matrix getMatrix() { + return matrix; + } + + public int getWidth() { + return width; + } + + public int getHeight() { + return height; + } + + public void setCacheFile(File file) { + cacheFile = file; + } + + public void setRotation(int angle) { + if (angle == 0) { + matrix = Matrix.ROTATE_0; + } else if (angle == 90) { + matrix = Matrix.ROTATE_90; + } else if (angle == 180) { + matrix = Matrix.ROTATE_180; + } else if (angle == 270) { + matrix = Matrix.ROTATE_270; + } + } + + public void setSize(int w, int h) { + width = w; + height = h; + } + + public ArrayList getTracks() { + return tracks; + } + + public File getCacheFile() { + return cacheFile; + } + + public void addSample(int trackIndex, long offset, MediaCodec.BufferInfo bufferInfo) throws Exception { + if (trackIndex < 0 || trackIndex >= tracks.size()) { + return; + } + Track track = tracks.get(trackIndex); + track.addSample(offset, bufferInfo); + } + + public int addTrack(MediaFormat mediaFormat, boolean isVideo) throws Exception { + tracks.add(new Track(tracks.size(), mediaFormat, isVideo)); + return tracks.size() - 1; + } +} diff --git a/TMessagesProj/src/main/java/org/telegram/android/video/OutputSurface.java b/TMessagesProj/src/main/java/org/telegram/android/video/OutputSurface.java new file mode 100644 index 000000000..6511d9608 --- /dev/null +++ b/TMessagesProj/src/main/java/org/telegram/android/video/OutputSurface.java @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.telegram.android.video; + +import android.annotation.TargetApi; +import android.graphics.SurfaceTexture; +import android.opengl.EGL14; +import android.view.Surface; +import javax.microedition.khronos.egl.EGL10; +import javax.microedition.khronos.egl.EGLConfig; +import javax.microedition.khronos.egl.EGLContext; +import javax.microedition.khronos.egl.EGLDisplay; +import javax.microedition.khronos.egl.EGLSurface; + +@TargetApi(17) +public class OutputSurface implements SurfaceTexture.OnFrameAvailableListener { + + private static final int EGL_OPENGL_ES2_BIT = 4; + private EGL10 mEGL; + private EGLDisplay mEGLDisplay; + private EGLContext mEGLContext; + private EGLSurface mEGLSurface; + private SurfaceTexture mSurfaceTexture; + private Surface mSurface; + private final Object mFrameSyncObject = new Object(); + private boolean mFrameAvailable; + private TextureRenderer mTextureRender; + + public OutputSurface(int width, int height) { + if (width <= 0 || height <= 0) { + throw new IllegalArgumentException(); + } + eglSetup(width, height); + makeCurrent(); + setup(); + } + + public OutputSurface() { + setup(); + } + + private void setup() { + mTextureRender = new TextureRenderer(); + mTextureRender.surfaceCreated(); + mSurfaceTexture = new SurfaceTexture(mTextureRender.getTextureId()); + mSurfaceTexture.setOnFrameAvailableListener(this); + mSurface = new Surface(mSurfaceTexture); + } + + private void eglSetup(int width, int height) { + mEGL = (EGL10) EGLContext.getEGL(); + mEGLDisplay = mEGL.eglGetDisplay(EGL10.EGL_DEFAULT_DISPLAY); + if (!mEGL.eglInitialize(mEGLDisplay, null)) { + throw new RuntimeException("unable to initialize EGL10"); + } + int[] attribList = { + EGL10.EGL_RED_SIZE, 8, + EGL10.EGL_GREEN_SIZE, 8, + EGL10.EGL_BLUE_SIZE, 8, + EGL10.EGL_SURFACE_TYPE, EGL10.EGL_PBUFFER_BIT, + EGL10.EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, + EGL10.EGL_NONE + }; + EGLConfig[] configs = new EGLConfig[1]; + int[] numConfigs = new int[1]; + if (!mEGL.eglChooseConfig(mEGLDisplay, attribList, configs, 1, numConfigs)) { + throw new RuntimeException("unable to find RGB888+pbuffer EGL config"); + } + int[] attrib_list = { + EGL14.EGL_CONTEXT_CLIENT_VERSION, 2, + EGL10.EGL_NONE + }; + mEGLContext = mEGL.eglCreateContext(mEGLDisplay, configs[0], EGL10.EGL_NO_CONTEXT, + attrib_list); + checkEglError("eglCreateContext"); + if (mEGLContext == null) { + throw new RuntimeException("null context"); + } + int[] surfaceAttribs = { + EGL10.EGL_WIDTH, width, + EGL10.EGL_HEIGHT, height, + EGL10.EGL_NONE + }; + mEGLSurface = mEGL.eglCreatePbufferSurface(mEGLDisplay, configs[0], surfaceAttribs); + checkEglError("eglCreatePbufferSurface"); + if (mEGLSurface == null) { + throw new RuntimeException("surface was null"); + } + } + + public void release() { + if (mEGL != null) { + if (mEGL.eglGetCurrentContext().equals(mEGLContext)) { + mEGL.eglMakeCurrent(mEGLDisplay, EGL10.EGL_NO_SURFACE, EGL10.EGL_NO_SURFACE, EGL10.EGL_NO_CONTEXT); + } + mEGL.eglDestroySurface(mEGLDisplay, mEGLSurface); + mEGL.eglDestroyContext(mEGLDisplay, mEGLContext); + } + mSurface.release(); + mEGLDisplay = null; + mEGLContext = null; + mEGLSurface = null; + mEGL = null; + mTextureRender = null; + mSurface = null; + mSurfaceTexture = null; + } + + public void makeCurrent() { + if (mEGL == null) { + throw new RuntimeException("not configured for makeCurrent"); + } + checkEglError("before makeCurrent"); + if (!mEGL.eglMakeCurrent(mEGLDisplay, mEGLSurface, mEGLSurface, mEGLContext)) { + throw new RuntimeException("eglMakeCurrent 
failed"); + } + } + + public Surface getSurface() { + return mSurface; + } + + public void changeFragmentShader(String fragmentShader) { + mTextureRender.changeFragmentShader(fragmentShader); + } + + public void awaitNewImage() { + final int TIMEOUT_MS = 500; + synchronized (mFrameSyncObject) { + while (!mFrameAvailable) { + try { + mFrameSyncObject.wait(TIMEOUT_MS); + if (!mFrameAvailable) { + throw new RuntimeException("Surface frame wait timed out"); + } + } catch (InterruptedException ie) { + throw new RuntimeException(ie); + } + } + mFrameAvailable = false; + } + mTextureRender.checkGlError("before updateTexImage"); + mSurfaceTexture.updateTexImage(); + } + + public void drawImage() { + mTextureRender.drawFrame(mSurfaceTexture); + } + + @Override + public void onFrameAvailable(SurfaceTexture st) { + synchronized (mFrameSyncObject) { + if (mFrameAvailable) { + throw new RuntimeException("mFrameAvailable already set, frame could be dropped"); + } + mFrameAvailable = true; + mFrameSyncObject.notifyAll(); + } + } + + private void checkEglError(String msg) { + if (mEGL.eglGetError() != EGL10.EGL_SUCCESS) { + throw new RuntimeException("EGL error encountered (see log)"); + } + } +} diff --git a/TMessagesProj/src/main/java/org/telegram/android/video/Sample.java b/TMessagesProj/src/main/java/org/telegram/android/video/Sample.java new file mode 100644 index 000000000..f6d7954ff --- /dev/null +++ b/TMessagesProj/src/main/java/org/telegram/android/video/Sample.java @@ -0,0 +1,27 @@ +/* + * This is the source code of Telegram for Android v. 1.7.x. + * It is licensed under GNU GPL v. 2 or later. + * You should have received a copy of the license in this archive (see LICENSE). + * + * Copyright Nikolai Kudashov, 2013-2014. + */ + +package org.telegram.android.video; + +public class Sample { + private long offset = 0; + private long size = 0; + + public Sample(long offset, long size) { + this.offset = offset; + this.size = size; + } + + public long getOffset() { + return offset; + } + + public long getSize() { + return size; + } +} diff --git a/TMessagesProj/src/main/java/org/telegram/android/video/TextureRenderer.java b/TMessagesProj/src/main/java/org/telegram/android/video/TextureRenderer.java new file mode 100644 index 000000000..7b09eeb36 --- /dev/null +++ b/TMessagesProj/src/main/java/org/telegram/android/video/TextureRenderer.java @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.telegram.android.video; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.FloatBuffer; + +import android.annotation.TargetApi; +import android.graphics.SurfaceTexture; +import android.opengl.GLES11Ext; +import android.opengl.GLES20; +import android.opengl.Matrix; + +@TargetApi(17) +public class TextureRenderer { + private static final int FLOAT_SIZE_BYTES = 4; + private static final int TRIANGLE_VERTICES_DATA_STRIDE_BYTES = 5 * FLOAT_SIZE_BYTES; + private static final int TRIANGLE_VERTICES_DATA_POS_OFFSET = 0; + private static final int TRIANGLE_VERTICES_DATA_UV_OFFSET = 3; + private static final float[] mTriangleVerticesData = { + // X, Y, Z, U, V + -1.0f, -1.0f, 0, 0.f, 0.f, + 1.0f, -1.0f, 0, 1.f, 0.f, + -1.0f, 1.0f, 0, 0.f, 1.f, + 1.0f, 1.0f, 0, 1.f, 1.f, + }; + private FloatBuffer mTriangleVertices; + private static final String VERTEX_SHADER = + "uniform mat4 uMVPMatrix;\n" + + "uniform mat4 uSTMatrix;\n" + + "attribute vec4 aPosition;\n" + + "attribute vec4 aTextureCoord;\n" + + "varying vec2 vTextureCoord;\n" + + "void main() {\n" + + " gl_Position = uMVPMatrix * aPosition;\n" + + " vTextureCoord = (uSTMatrix * aTextureCoord).xy;\n" + + "}\n"; + private static final String FRAGMENT_SHADER = + "#extension GL_OES_EGL_image_external : require\n" + + "precision mediump float;\n" + // highp here doesn't seem to matter + "varying vec2 vTextureCoord;\n" + + "uniform samplerExternalOES sTexture;\n" + + "void main() {\n" + + " gl_FragColor = texture2D(sTexture, vTextureCoord);\n" + + "}\n"; + private float[] mMVPMatrix = new float[16]; + private float[] mSTMatrix = new float[16]; + private int mProgram; + private int mTextureID = -12345; + private int muMVPMatrixHandle; + private int muSTMatrixHandle; + private int maPositionHandle; + private int maTextureHandle; + + public TextureRenderer() { + mTriangleVertices = ByteBuffer.allocateDirect(mTriangleVerticesData.length * FLOAT_SIZE_BYTES).order(ByteOrder.nativeOrder()).asFloatBuffer(); + mTriangleVertices.put(mTriangleVerticesData).position(0); + Matrix.setIdentityM(mSTMatrix, 0); + } + + public int getTextureId() { + return mTextureID; + } + + public void drawFrame(SurfaceTexture st) { + checkGlError("onDrawFrame start"); + st.getTransformMatrix(mSTMatrix); + GLES20.glClearColor(0.0f, 1.0f, 0.0f, 1.0f); + GLES20.glClear(GLES20.GL_DEPTH_BUFFER_BIT | GLES20.GL_COLOR_BUFFER_BIT); + GLES20.glUseProgram(mProgram); + checkGlError("glUseProgram"); + GLES20.glActiveTexture(GLES20.GL_TEXTURE0); + GLES20.glBindTexture(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, mTextureID); + mTriangleVertices.position(TRIANGLE_VERTICES_DATA_POS_OFFSET); + GLES20.glVertexAttribPointer(maPositionHandle, 3, GLES20.GL_FLOAT, false, TRIANGLE_VERTICES_DATA_STRIDE_BYTES, mTriangleVertices); + checkGlError("glVertexAttribPointer maPosition"); + GLES20.glEnableVertexAttribArray(maPositionHandle); + checkGlError("glEnableVertexAttribArray maPositionHandle"); + mTriangleVertices.position(TRIANGLE_VERTICES_DATA_UV_OFFSET); + GLES20.glVertexAttribPointer(maTextureHandle, 2, GLES20.GL_FLOAT, false, TRIANGLE_VERTICES_DATA_STRIDE_BYTES, mTriangleVertices); + checkGlError("glVertexAttribPointer maTextureHandle"); + GLES20.glEnableVertexAttribArray(maTextureHandle); + checkGlError("glEnableVertexAttribArray maTextureHandle"); + Matrix.setIdentityM(mMVPMatrix, 0); + GLES20.glUniformMatrix4fv(muMVPMatrixHandle, 1, false, mMVPMatrix, 0); + GLES20.glUniformMatrix4fv(muSTMatrixHandle, 1, false, mSTMatrix, 0); + 
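+        // Illustrative note: the decoded frame bound to the external-OES texture is drawn as a
+        // full-screen quad; uSTMatrix carries the transform from SurfaceTexture.getTransformMatrix().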
GLES20.glDrawArrays(GLES20.GL_TRIANGLE_STRIP, 0, 4); + checkGlError("glDrawArrays"); + GLES20.glFinish(); + } + + public void surfaceCreated() { + mProgram = createProgram(VERTEX_SHADER, FRAGMENT_SHADER); + if (mProgram == 0) { + throw new RuntimeException("failed creating program"); + } + maPositionHandle = GLES20.glGetAttribLocation(mProgram, "aPosition"); + checkGlError("glGetAttribLocation aPosition"); + if (maPositionHandle == -1) { + throw new RuntimeException("Could not get attrib location for aPosition"); + } + maTextureHandle = GLES20.glGetAttribLocation(mProgram, "aTextureCoord"); + checkGlError("glGetAttribLocation aTextureCoord"); + if (maTextureHandle == -1) { + throw new RuntimeException("Could not get attrib location for aTextureCoord"); + } + muMVPMatrixHandle = GLES20.glGetUniformLocation(mProgram, "uMVPMatrix"); + checkGlError("glGetUniformLocation uMVPMatrix"); + if (muMVPMatrixHandle == -1) { + throw new RuntimeException("Could not get attrib location for uMVPMatrix"); + } + muSTMatrixHandle = GLES20.glGetUniformLocation(mProgram, "uSTMatrix"); + checkGlError("glGetUniformLocation uSTMatrix"); + if (muSTMatrixHandle == -1) { + throw new RuntimeException("Could not get attrib location for uSTMatrix"); + } + int[] textures = new int[1]; + GLES20.glGenTextures(1, textures, 0); + mTextureID = textures[0]; + GLES20.glBindTexture(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, mTextureID); + checkGlError("glBindTexture mTextureID"); + GLES20.glTexParameterf(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_NEAREST); + GLES20.glTexParameterf(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR); + GLES20.glTexParameteri(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE); + GLES20.glTexParameteri(GLES11Ext.GL_TEXTURE_EXTERNAL_OES, GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE); + checkGlError("glTexParameter"); + } + + public void changeFragmentShader(String fragmentShader) { + GLES20.glDeleteProgram(mProgram); + mProgram = createProgram(VERTEX_SHADER, fragmentShader); + if (mProgram == 0) { + throw new RuntimeException("failed creating program"); + } + } + + private int loadShader(int shaderType, String source) { + int shader = GLES20.glCreateShader(shaderType); + checkGlError("glCreateShader type=" + shaderType); + GLES20.glShaderSource(shader, source); + GLES20.glCompileShader(shader); + int[] compiled = new int[1]; + GLES20.glGetShaderiv(shader, GLES20.GL_COMPILE_STATUS, compiled, 0); + if (compiled[0] == 0) { + GLES20.glDeleteShader(shader); + shader = 0; + } + return shader; + } + + private int createProgram(String vertexSource, String fragmentSource) { + int vertexShader = loadShader(GLES20.GL_VERTEX_SHADER, vertexSource); + if (vertexShader == 0) { + return 0; + } + int pixelShader = loadShader(GLES20.GL_FRAGMENT_SHADER, fragmentSource); + if (pixelShader == 0) { + return 0; + } + int program = GLES20.glCreateProgram(); + checkGlError("glCreateProgram"); + if (program == 0) { + return 0; + } + GLES20.glAttachShader(program, vertexShader); + checkGlError("glAttachShader"); + GLES20.glAttachShader(program, pixelShader); + checkGlError("glAttachShader"); + GLES20.glLinkProgram(program); + int[] linkStatus = new int[1]; + GLES20.glGetProgramiv(program, GLES20.GL_LINK_STATUS, linkStatus, 0); + if (linkStatus[0] != GLES20.GL_TRUE) { + GLES20.glDeleteProgram(program); + program = 0; + } + return program; + } + + public void checkGlError(String op) { + int error; + if ((error = GLES20.glGetError()) 
!= GLES20.GL_NO_ERROR) { + throw new RuntimeException(op + ": glError " + error); + } + } +} diff --git a/TMessagesProj/src/main/java/org/telegram/android/video/Track.java b/TMessagesProj/src/main/java/org/telegram/android/video/Track.java new file mode 100644 index 000000000..b2eabafdf --- /dev/null +++ b/TMessagesProj/src/main/java/org/telegram/android/video/Track.java @@ -0,0 +1,247 @@ +/* + * This is the source code of Telegram for Android v. 1.7.x. + * It is licensed under GNU GPL v. 2 or later. + * You should have received a copy of the license in this archive (see LICENSE). + * + * Copyright Nikolai Kudashov, 2013-2014. + */ + +package org.telegram.android.video; + +import android.annotation.TargetApi; +import android.media.MediaCodec; +import android.media.MediaFormat; + +import com.coremedia.iso.boxes.AbstractMediaHeaderBox; +import com.coremedia.iso.boxes.SampleDescriptionBox; +import com.coremedia.iso.boxes.SoundMediaHeaderBox; +import com.coremedia.iso.boxes.VideoMediaHeaderBox; +import com.coremedia.iso.boxes.h264.AvcConfigurationBox; +import com.coremedia.iso.boxes.sampleentry.AudioSampleEntry; +import com.coremedia.iso.boxes.sampleentry.VisualSampleEntry; +import com.googlecode.mp4parser.boxes.mp4.ESDescriptorBox; +import com.googlecode.mp4parser.boxes.mp4.objectdescriptors.AudioSpecificConfig; +import com.googlecode.mp4parser.boxes.mp4.objectdescriptors.DecoderConfigDescriptor; +import com.googlecode.mp4parser.boxes.mp4.objectdescriptors.ESDescriptor; +import com.googlecode.mp4parser.boxes.mp4.objectdescriptors.SLConfigDescriptor; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.Map; + +@TargetApi(16) +public class Track { + private long trackId = 0; + private ArrayList samples = new ArrayList(); + private long duration = 0; + private String handler; + private AbstractMediaHeaderBox headerBox = null; + private SampleDescriptionBox sampleDescriptionBox = null; + private LinkedList syncSamples = null; + private int timeScale; + private Date creationTime = new Date(); + private int height; + private int width; + private float volume = 0; + private ArrayList sampleDurations = new ArrayList(); + private boolean isAudio = false; + private static Map samplingFrequencyIndexMap = new HashMap(); + private long lastPresentationTimeUs = 0; + private boolean first = true; + + static { + samplingFrequencyIndexMap.put(96000, 0x0); + samplingFrequencyIndexMap.put(88200, 0x1); + samplingFrequencyIndexMap.put(64000, 0x2); + samplingFrequencyIndexMap.put(48000, 0x3); + samplingFrequencyIndexMap.put(44100, 0x4); + samplingFrequencyIndexMap.put(32000, 0x5); + samplingFrequencyIndexMap.put(24000, 0x6); + samplingFrequencyIndexMap.put(22050, 0x7); + samplingFrequencyIndexMap.put(16000, 0x8); + samplingFrequencyIndexMap.put(12000, 0x9); + samplingFrequencyIndexMap.put(11025, 0xa); + samplingFrequencyIndexMap.put(8000, 0xb); + } + + public Track(int id, MediaFormat format, boolean isVideo) throws Exception { + trackId = id; + if (isVideo) { + sampleDurations.add((long)3015); + duration = 3015; + width = format.getInteger(MediaFormat.KEY_WIDTH); + height = format.getInteger(MediaFormat.KEY_HEIGHT); + timeScale = 90000; + syncSamples = new LinkedList(); + handler = "vide"; + headerBox = new VideoMediaHeaderBox(); + sampleDescriptionBox = new SampleDescriptionBox(); + VisualSampleEntry visualSampleEntry = new VisualSampleEntry("avc1"); + visualSampleEntry.setDataReferenceIndex(1); + 
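+            // Illustrative note: the avc1 sample entry and its avcC box below are filled from the
+            // encoder's MediaFormat; csd-0/csd-1 carry the H.264 SPS/PPS, and position(4) skips
+            // the 4-byte Annex-B start code before they are stored.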
visualSampleEntry.setDepth(24); + visualSampleEntry.setFrameCount(1); + visualSampleEntry.setHorizresolution(72); + visualSampleEntry.setVertresolution(72); + visualSampleEntry.setWidth(width); + visualSampleEntry.setHeight(height); + + AvcConfigurationBox avcConfigurationBox = new AvcConfigurationBox(); + + ArrayList spsArray = new ArrayList(); + ByteBuffer spsBuff = format.getByteBuffer("csd-0"); + spsBuff.position(4); + byte[] spsBytes = new byte[spsBuff.remaining()]; + spsBuff.get(spsBytes); + spsArray.add(spsBytes); + + ArrayList ppsArray = new ArrayList(); + ByteBuffer ppsBuff = format.getByteBuffer("csd-1"); + ppsBuff.position(4); + byte[] ppsBytes = new byte[ppsBuff.remaining()]; + ppsBuff.get(ppsBytes); + ppsArray.add(ppsBytes); + //ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(spsBytes); + //SeqParameterSet seqParameterSet = SeqParameterSet.read(byteArrayInputStream); + + avcConfigurationBox.setSequenceParameterSets(spsArray); + avcConfigurationBox.setPictureParameterSets(ppsArray); + avcConfigurationBox.setAvcLevelIndication(13); + avcConfigurationBox.setAvcProfileIndication(100); + avcConfigurationBox.setBitDepthLumaMinus8(-1); + avcConfigurationBox.setBitDepthChromaMinus8(-1); + avcConfigurationBox.setChromaFormat(-1); + avcConfigurationBox.setConfigurationVersion(1); + avcConfigurationBox.setLengthSizeMinusOne(3); + avcConfigurationBox.setProfileCompatibility(0); + + visualSampleEntry.addBox(avcConfigurationBox); + sampleDescriptionBox.addBox(visualSampleEntry); + } else { + sampleDurations.add((long)1024); + duration = 1024; + isAudio = true; + volume = 1; + timeScale = format.getInteger(MediaFormat.KEY_SAMPLE_RATE); + handler = "soun"; + headerBox = new SoundMediaHeaderBox(); + sampleDescriptionBox = new SampleDescriptionBox(); + AudioSampleEntry audioSampleEntry = new AudioSampleEntry("mp4a"); + audioSampleEntry.setChannelCount(format.getInteger(MediaFormat.KEY_CHANNEL_COUNT)); + audioSampleEntry.setSampleRate(format.getInteger(MediaFormat.KEY_SAMPLE_RATE)); + audioSampleEntry.setDataReferenceIndex(1); + audioSampleEntry.setSampleSize(16); + + ESDescriptorBox esds = new ESDescriptorBox(); + ESDescriptor descriptor = new ESDescriptor(); + descriptor.setEsId(0); + + SLConfigDescriptor slConfigDescriptor = new SLConfigDescriptor(); + slConfigDescriptor.setPredefined(2); + descriptor.setSlConfigDescriptor(slConfigDescriptor); + + DecoderConfigDescriptor decoderConfigDescriptor = new DecoderConfigDescriptor(); + decoderConfigDescriptor.setObjectTypeIndication(0x40); + decoderConfigDescriptor.setStreamType(5); + decoderConfigDescriptor.setBufferSizeDB(1536); + decoderConfigDescriptor.setMaxBitRate(96000); + decoderConfigDescriptor.setAvgBitRate(96000); + + AudioSpecificConfig audioSpecificConfig = new AudioSpecificConfig(); + audioSpecificConfig.setAudioObjectType(2); + audioSpecificConfig.setSamplingFrequencyIndex(samplingFrequencyIndexMap.get((int)audioSampleEntry.getSampleRate())); + audioSpecificConfig.setChannelConfiguration(audioSampleEntry.getChannelCount()); + decoderConfigDescriptor.setAudioSpecificInfo(audioSpecificConfig); + + descriptor.setDecoderConfigDescriptor(decoderConfigDescriptor); + + ByteBuffer data = descriptor.serialize(); + esds.setEsDescriptor(descriptor); + esds.setData(data); + audioSampleEntry.addBox(esds); + sampleDescriptionBox.addBox(audioSampleEntry); + } + } + + public long getTrackId() { + return trackId; + } + + public void addSample(long offset, MediaCodec.BufferInfo bufferInfo) { + boolean isSyncFrame = !isAudio && 
(bufferInfo.flags & MediaCodec.BUFFER_FLAG_SYNC_FRAME) != 0; + samples.add(new Sample(offset, bufferInfo.size)); + if (syncSamples != null && isSyncFrame) { + syncSamples.add(samples.size()); + } + + long delta = bufferInfo.presentationTimeUs - lastPresentationTimeUs; + lastPresentationTimeUs = bufferInfo.presentationTimeUs; + delta = (delta * timeScale + 500000L) / 1000000L; + if (!first) { + sampleDurations.add(sampleDurations.size() - 1, delta); + duration += delta; + } + first = false; + } + + public ArrayList getSamples() { + return samples; + } + + public long getDuration() { + return duration; + } + + public String getHandler() { + return handler; + } + + public AbstractMediaHeaderBox getMediaHeaderBox() { + return headerBox; + } + + public SampleDescriptionBox getSampleDescriptionBox() { + return sampleDescriptionBox; + } + + public long[] getSyncSamples() { + if (syncSamples == null || syncSamples.isEmpty()) { + return null; + } + long[] returns = new long[syncSamples.size()]; + for (int i = 0; i < syncSamples.size(); i++) { + returns[i] = syncSamples.get(i); + } + return returns; + } + + public int getTimeScale() { + return timeScale; + } + + public Date getCreationTime() { + return creationTime; + } + + public int getWidth() { + return width; + } + + public int getHeight() { + return height; + } + + public float getVolume() { + return volume; + } + + public ArrayList getSampleDurations() { + return sampleDurations; + } + + public boolean isAudio() { + return isAudio; + } +} diff --git a/TMessagesProj/src/main/java/org/telegram/messenger/ConnectionsManager.java b/TMessagesProj/src/main/java/org/telegram/messenger/ConnectionsManager.java index 5acd0bcf3..e87e5d089 100644 --- a/TMessagesProj/src/main/java/org/telegram/messenger/ConnectionsManager.java +++ b/TMessagesProj/src/main/java/org/telegram/messenger/ConnectionsManager.java @@ -19,6 +19,7 @@ import android.util.Base64; import org.telegram.android.AndroidUtilities; import org.telegram.android.ContactsController; +import org.telegram.android.LocaleController; import org.telegram.android.MessagesController; import org.telegram.android.NotificationCenter; import org.telegram.ui.ApplicationLoader; @@ -410,7 +411,7 @@ public class ConnectionsManager implements Action.ActionDelegate, TcpConnection. if (currentDatacenterId != 0 && UserConfig.isClientActivated()) { Datacenter datacenter = datacenterWithId(currentDatacenterId); - if (datacenter.authKey == null) { + if (datacenter == null || datacenter.authKey == null) { currentDatacenterId = 0; datacenters.clear(); UserConfig.clearConfig(); @@ -800,7 +801,7 @@ public class ConnectionsManager implements Action.ActionDelegate, TcpConnection. invoke.query = object; invoke.api_id = BuildVars.APP_ID; try { - invoke.lang_code = Locale.getDefault().getCountry(); + invoke.lang_code = LocaleController.getLocaleString(Locale.getDefault()); invoke.device_model = Build.MANUFACTURER + Build.MODEL; if (invoke.device_model == null) { invoke.device_model = "Android unknown"; @@ -1778,7 +1779,7 @@ public class ConnectionsManager implements Action.ActionDelegate, TcpConnection. 
req.token = "" + pushSessionId; req.app_sandbox = false; try { - req.lang_code = Locale.getDefault().getCountry(); + req.lang_code = LocaleController.getLocaleString(Locale.getDefault()); req.device_model = Build.MANUFACTURER + Build.MODEL; if (req.device_model == null) { req.device_model = "Android unknown"; diff --git a/TMessagesProj/src/main/java/org/telegram/messenger/FileLoader.java b/TMessagesProj/src/main/java/org/telegram/messenger/FileLoader.java index 256666ef5..1f18873e4 100644 --- a/TMessagesProj/src/main/java/org/telegram/messenger/FileLoader.java +++ b/TMessagesProj/src/main/java/org/telegram/messenger/FileLoader.java @@ -83,7 +83,28 @@ public class FileLoader { return fileProgresses.get(location); } + public void checkUploadNewDataAvailable(final String location, final boolean encrypted, final long finalSize) { + fileLoaderQueue.postRunnable(new Runnable() { + @Override + public void run() { + FileUploadOperation operation = null; + if (encrypted) { + operation = uploadOperationPathsEnc.get(location); + } else { + operation = uploadOperationPaths.get(location); + } + if (operation != null) { + operation.checkNewDataAvailable(finalSize); + } + } + }); + } + public void uploadFile(final String location, final boolean encrypted, final boolean small) { + uploadFile(location, encrypted, small, 0); + } + + public void uploadFile(final String location, final boolean encrypted, final boolean small, final int estimatedSize) { fileLoaderQueue.postRunnable(new Runnable() { @Override public void run() { @@ -96,7 +117,7 @@ public class FileLoader { return; } } - FileUploadOperation operation = new FileUploadOperation(location, encrypted); + FileUploadOperation operation = new FileUploadOperation(location, encrypted, estimatedSize); if (encrypted) { uploadOperationPathsEnc.put(location, operation); } else { diff --git a/TMessagesProj/src/main/java/org/telegram/messenger/FileUploadOperation.java b/TMessagesProj/src/main/java/org/telegram/messenger/FileUploadOperation.java index d967d7b88..c78587bb1 100644 --- a/TMessagesProj/src/main/java/org/telegram/messenger/FileUploadOperation.java +++ b/TMessagesProj/src/main/java/org/telegram/messenger/FileUploadOperation.java @@ -41,6 +41,7 @@ public class FileUploadOperation { private int fingerprint = 0; private boolean isBigFile = false; private String fileKey; + private int estimatedSize = 0; FileInputStream stream; MessageDigest mdEnc = null; @@ -50,9 +51,10 @@ public class FileUploadOperation { public abstract void didChangedUploadProgress(FileUploadOperation operation, float progress); } - public FileUploadOperation(String location, boolean encrypted) { + public FileUploadOperation(String location, boolean encrypted, int estimated) { uploadingFilePath = location; isEncrypted = encrypted; + estimatedSize = estimated; } public void start() { @@ -60,7 +62,12 @@ public class FileUploadOperation { return; } state = 1; - startUploadRequest(); + Utilities.stageQueue.postRunnable(new Runnable() { + @Override + public void run() { + startUploadRequest(); + } + }); } public void cancel() { @@ -86,6 +93,22 @@ public class FileUploadOperation { remove(fileKey + "_ivc").commit(); } + public void checkNewDataAvailable(final long finalSize) { + Utilities.stageQueue.postRunnable(new Runnable() { + @Override + public void run() { + if (finalSize != 0) { + estimatedSize = 0; + totalFileSize = finalSize; + totalPartsCount = (int) Math.ceil((float) totalFileSize / (float) uploadChunkSize); + } + if (requestToken == 0) { + startUploadRequest(); + } + } + }); + } 
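+    // Illustrative sketch of how a video transcoder might drive this parallel-upload path while
+    // MP4Builder is still writing the cache file (the variable names here are hypothetical):
+    //
+    //   String path = AndroidUtilities.getCacheDir() + "/" + video.id + ".mp4";
+    //   FileLoader.getInstance().uploadFile(path, false, false, estimatedBytes);   // start early with a size estimate
+    //   // ...after each flush of newly encoded data to disk...
+    //   FileLoader.getInstance().checkUploadNewDataAvailable(path, false, 0);       // 0 = more data, not final yet
+    //   // ...after MP4Builder.finishMovie()...
+    //   FileLoader.getInstance().checkUploadNewDataAvailable(path, false, new File(path).length());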
+ private void startUploadRequest() { if (state != 1) { return; @@ -97,7 +120,11 @@ public class FileUploadOperation { if (stream == null) { File cacheFile = new File(uploadingFilePath); stream = new FileInputStream(cacheFile); - totalFileSize = cacheFile.length(); + if (estimatedSize != 0) { + totalFileSize = estimatedSize; + } else { + totalFileSize = cacheFile.length(); + } if (totalFileSize > 10 * 1024 * 1024) { isBigFile = true; } else { @@ -126,7 +153,7 @@ public class FileUploadOperation { long fileSize = preferences.getLong(fileKey + "_size", 0); int currentTime = (int)(System.currentTimeMillis() / 1000); boolean rewrite = false; - if (fileSize == totalFileSize) { + if (estimatedSize == 0 && fileSize == totalFileSize) { currentFileId = preferences.getLong(fileKey + "_id", 0); int date = preferences.getInt(fileKey + "_time", 0); long uploadedSize = preferences.getLong(fileKey + "_uploaded", 0); @@ -207,17 +234,19 @@ public class FileUploadOperation { System.arraycopy(iv, 0, ivChange, 0, 32); } currentFileId = Utilities.random.nextLong(); - SharedPreferences.Editor editor = preferences.edit(); - editor.putInt(fileKey + "_time", currentTime); - editor.putLong(fileKey + "_size", totalFileSize); - editor.putLong(fileKey + "_id", currentFileId); - editor.remove(fileKey + "_uploaded"); - if (isEncrypted) { - editor.putString(fileKey + "_iv", Utilities.bytesToHex(iv)); - editor.putString(fileKey + "_ivc", Utilities.bytesToHex(ivChange)); - editor.putString(fileKey + "_key", Utilities.bytesToHex(key)); + if (estimatedSize == 0) { + SharedPreferences.Editor editor = preferences.edit(); + editor.putInt(fileKey + "_time", currentTime); + editor.putLong(fileKey + "_size", totalFileSize); + editor.putLong(fileKey + "_id", currentFileId); + editor.remove(fileKey + "_uploaded"); + if (isEncrypted) { + editor.putString(fileKey + "_iv", Utilities.bytesToHex(iv)); + editor.putString(fileKey + "_ivc", Utilities.bytesToHex(ivChange)); + editor.putString(fileKey + "_key", Utilities.bytesToHex(key)); + } + editor.commit(); } - editor.commit(); } if (isEncrypted) { @@ -234,7 +263,7 @@ public class FileUploadOperation { FileLog.e("tmessages", e); } } - } else { + } else if (estimatedSize == 0) { if (saveInfoTimes >= 4) { saveInfoTimes = 0; } @@ -250,13 +279,20 @@ public class FileUploadOperation { saveInfoTimes++; } + if (estimatedSize != 0) { + long size = stream.getChannel().size(); + if (currentUploaded + uploadChunkSize > size) { + return; + } + } + int read = stream.read(readBuffer); int toAdd = 0; if (isEncrypted && read % 16 != 0) { toAdd += 16 - read % 16; } ByteBufferDesc sendBuffer = BuffersStorage.getInstance().getFreeBuffer(read + toAdd); - if (read != uploadChunkSize || totalPartsCount == currentPartNum + 1) { + if (read != uploadChunkSize || estimatedSize == 0 && totalPartsCount == currentPartNum + 1) { isLastPart = true; } sendBuffer.writeRaw(readBuffer, 0, read); @@ -274,7 +310,11 @@ public class FileUploadOperation { TLRPC.TL_upload_saveBigFilePart req = new TLRPC.TL_upload_saveBigFilePart(); req.file_part = currentPartNum; req.file_id = currentFileId; - req.file_total_parts = totalPartsCount; + if (estimatedSize != 0) { + req.file_total_parts = -1; + } else { + req.file_total_parts = totalPartsCount; + } req.bytes = sendBuffer; finalRequest = req; } else { diff --git a/TMessagesProj/src/main/java/org/telegram/messenger/TLRPC.java b/TMessagesProj/src/main/java/org/telegram/messenger/TLRPC.java index ca4be6f9a..e5417040e 100644 --- 
a/TMessagesProj/src/main/java/org/telegram/messenger/TLRPC.java +++ b/TMessagesProj/src/main/java/org/telegram/messenger/TLRPC.java @@ -4145,7 +4145,10 @@ public class TLRPC { stream.readInt32(); int count = stream.readInt32(); for (int a = 0; a < count; a++) { - sizes.add((PhotoSize)TLClassStore.Instance().TLdeserialize(stream, stream.readInt32())); + PhotoSize size = (PhotoSize)TLClassStore.Instance().TLdeserialize(stream, stream.readInt32()); + if (size != null) { + sizes.add(size); + } } } @@ -8967,6 +8970,7 @@ public class TLRPC { public String path; public byte[] key; public byte[] iv; + public boolean estimatedSize; } public static class Document extends TLObject { diff --git a/TMessagesProj/src/main/java/org/telegram/messenger/Utilities.java b/TMessagesProj/src/main/java/org/telegram/messenger/Utilities.java index 163b54a69..88cb53573 100644 --- a/TMessagesProj/src/main/java/org/telegram/messenger/Utilities.java +++ b/TMessagesProj/src/main/java/org/telegram/messenger/Utilities.java @@ -131,7 +131,7 @@ public class Utilities { public native static long doPQNative(long _what); public native static void loadBitmap(String path, int[] bitmap, int scale, int format, int width, int height); - public native static void blurBitmap(Object bitmap, int width, int height, int stride); + public native static void blurBitmap(Object bitmap); private native static void aesIgeEncryption(ByteBuffer buffer, byte[] key, byte[] iv, boolean encrypt, int offset, int length); public static void aesIgeEncryption(ByteBuffer buffer, byte[] key, byte[] iv, boolean encrypt, boolean changeIv, int offset, int length) { diff --git a/TMessagesProj/src/main/java/org/telegram/ui/Cells/ChatBaseCell.java b/TMessagesProj/src/main/java/org/telegram/ui/Cells/ChatBaseCell.java index 41365d4e0..f7e73fc5b 100644 --- a/TMessagesProj/src/main/java/org/telegram/ui/Cells/ChatBaseCell.java +++ b/TMessagesProj/src/main/java/org/telegram/ui/Cells/ChatBaseCell.java @@ -575,7 +575,7 @@ public class ChatBaseCell extends BaseCell { drawClock = false; drawError = true; } else if (currentMessageObject.messageOwner.send_state == MessageObject.MESSAGE_SEND_STATE_SENT) { - if (!currentMessageObject.messageOwner.unread) { + if (!currentMessageObject.isUnread()) { drawCheck1 = true; drawCheck2 = true; } else { diff --git a/TMessagesProj/src/main/java/org/telegram/ui/Cells/ChatMediaCell.java b/TMessagesProj/src/main/java/org/telegram/ui/Cells/ChatMediaCell.java index dfb635bf6..4679d766c 100644 --- a/TMessagesProj/src/main/java/org/telegram/ui/Cells/ChatMediaCell.java +++ b/TMessagesProj/src/main/java/org/telegram/ui/Cells/ChatMediaCell.java @@ -345,7 +345,6 @@ public class ChatMediaCell extends ChatBaseCell implements MediaController.FileD int maxWidth = Math.min(AndroidUtilities.displaySize.x, AndroidUtilities.displaySize.y) - AndroidUtilities.dp(122 + 86 + 24); if (currentNameString == null || !currentNameString.equals(name)) { currentNameString = name; - nameWidth = (int) Math.ceil(namePaint.measureText(currentNameString)); nameWidth = Math.min(maxWidth, (int) Math.ceil(namePaint.measureText(currentNameString))); CharSequence str = TextUtils.ellipsize(currentNameString, namePaint, nameWidth, TextUtils.TruncateAt.END); nameLayout = new StaticLayout(str, namePaint, nameWidth, Layout.Alignment.ALIGN_NORMAL, 1.0f, 0.0f, false); diff --git a/TMessagesProj/src/main/java/org/telegram/ui/Cells/DialogCell.java b/TMessagesProj/src/main/java/org/telegram/ui/Cells/DialogCell.java index cfaadacff..640177455 100644 --- 
a/TMessagesProj/src/main/java/org/telegram/ui/Cells/DialogCell.java +++ b/TMessagesProj/src/main/java/org/telegram/ui/Cells/DialogCell.java @@ -20,6 +20,7 @@ import android.text.TextUtils; import org.telegram.android.AndroidUtilities; import org.telegram.PhoneFormat.PhoneFormat; import org.telegram.android.LocaleController; +import org.telegram.messenger.FileLog; import org.telegram.messenger.TLRPC; import org.telegram.android.ContactsController; import org.telegram.android.Emoji; @@ -288,10 +289,12 @@ public class DialogCell extends BaseCell { broadcastDrawable.draw(canvas); } - canvas.save(); - canvas.translate(cellLayout.nameLeft, cellLayout.nameTop); - cellLayout.nameLayout.draw(canvas); - canvas.restore(); + if (cellLayout.nameLayout != null) { + canvas.save(); + canvas.translate(cellLayout.nameLeft, cellLayout.nameTop); + cellLayout.nameLayout.draw(canvas); + canvas.restore(); + } canvas.save(); canvas.translate(cellLayout.timeLeft, cellLayout.timeTop); @@ -530,7 +533,7 @@ public class DialogCell extends BaseCell { drawError = true; drawCount = false; } else if (message.messageOwner.send_state == MessageObject.MESSAGE_SEND_STATE_SENT) { - if (!message.messageOwner.unread) { + if (!message.isUnread()) { drawCheck1 = true; drawCheck2 = true; } else { @@ -627,7 +630,11 @@ public class DialogCell extends BaseCell { } CharSequence nameStringFinal = TextUtils.ellipsize(nameString.replace("\n", " "), currentNamePaint, nameWidth - AndroidUtilities.dp(12), TextUtils.TruncateAt.END); - nameLayout = new StaticLayout(nameStringFinal, currentNamePaint, nameWidth, Layout.Alignment.ALIGN_NORMAL, 1.0f, 0.0f, false); + try { + nameLayout = new StaticLayout(nameStringFinal, currentNamePaint, nameWidth, Layout.Alignment.ALIGN_NORMAL, 1.0f, 0.0f, false); + } catch (Exception e) { + FileLog.e("tmessages", e); + } messageWidth = width - AndroidUtilities.dp(88); if (!LocaleController.isRTL) { @@ -680,7 +687,7 @@ public class DialogCell extends BaseCell { double widthpx = 0; float left = 0; if (LocaleController.isRTL) { - if (nameLayout.getLineCount() > 0) { + if (nameLayout != null && nameLayout.getLineCount() > 0) { left = nameLayout.getLineLeft(0); if (left == 0) { widthpx = Math.ceil(nameLayout.getLineWidth(0)); @@ -699,7 +706,7 @@ public class DialogCell extends BaseCell { } } } else { - if (nameLayout.getLineCount() > 0) { + if (nameLayout != null && nameLayout.getLineCount() > 0) { left = nameLayout.getLineRight(0); if (left == nameWidth) { widthpx = Math.ceil(nameLayout.getLineWidth(0)); diff --git a/TMessagesProj/src/main/java/org/telegram/ui/ChatActivity.java b/TMessagesProj/src/main/java/org/telegram/ui/ChatActivity.java index 75d0f0748..c27ab232c 100644 --- a/TMessagesProj/src/main/java/org/telegram/ui/ChatActivity.java +++ b/TMessagesProj/src/main/java/org/telegram/ui/ChatActivity.java @@ -55,6 +55,7 @@ import org.telegram.android.MediaController; import org.telegram.android.MessagesStorage; import org.telegram.android.NotificationsController; import org.telegram.android.SendMessagesHelper; +import org.telegram.messenger.FileLoader; import org.telegram.messenger.TLRPC; import org.telegram.android.ContactsController; import org.telegram.messenger.FileLog; @@ -324,7 +325,7 @@ public class ChatActivity extends BaseFragment implements NotificationCenter.Not NotificationCenter.getInstance().addObserver(this, NotificationCenter.updateInterfaces); NotificationCenter.getInstance().addObserver(this, NotificationCenter.didReceivedNewMessages); NotificationCenter.getInstance().addObserver(this, 
NotificationCenter.closeChats); - NotificationCenter.getInstance().addObserver(this, NotificationCenter.messagesReaded); + NotificationCenter.getInstance().addObserver(this, NotificationCenter.messagesRead); NotificationCenter.getInstance().addObserver(this, NotificationCenter.messagesDeleted); NotificationCenter.getInstance().addObserver(this, NotificationCenter.messageReceivedByServer); NotificationCenter.getInstance().addObserver(this, NotificationCenter.messageReceivedByAck); @@ -361,7 +362,7 @@ public class ChatActivity extends BaseFragment implements NotificationCenter.Not NotificationCenter.getInstance().removeObserver(this, NotificationCenter.updateInterfaces); NotificationCenter.getInstance().removeObserver(this, NotificationCenter.didReceivedNewMessages); NotificationCenter.getInstance().removeObserver(this, NotificationCenter.closeChats); - NotificationCenter.getInstance().removeObserver(this, NotificationCenter.messagesReaded); + NotificationCenter.getInstance().removeObserver(this, NotificationCenter.messagesRead); NotificationCenter.getInstance().removeObserver(this, NotificationCenter.messagesDeleted); NotificationCenter.getInstance().removeObserver(this, NotificationCenter.messageReceivedByServer); NotificationCenter.getInstance().removeObserver(this, NotificationCenter.messageReceivedByAck); @@ -1330,14 +1331,14 @@ public class ChatActivity extends BaseFragment implements NotificationCenter.Not } currentPicturePath = null; } - /*if(android.os.Build.VERSION.SDK_INT >= 10) { + /*if(android.os.Build.VERSION.SDK_INT >= 18) { Bundle args = new Bundle(); args.putString("videoPath", videoPath); VideoEditorActivity fragment = new VideoEditorActivity(args); fragment.setDelegate(this); presentFragment(fragment); } else {*/ - processSendingVideo(videoPath); + processSendingVideo(videoPath, null, 0, 0, 0, 0); //} } else if (requestCode == 21) { if (data == null || data.getData() == null) { @@ -1360,8 +1361,13 @@ public class ChatActivity extends BaseFragment implements NotificationCenter.Not } @Override - public void didFinishedVideoConverting(String videoPath) { - processSendingVideo(videoPath); + public void didStartVideoConverting(String videoPath, String originalPath, long esimatedSize, int duration, int width, int height) { + processSendingVideo(videoPath, originalPath, esimatedSize, duration, width, height); + } + + @Override + public void didAppenedVideoData(String videoPath, long finalSize) { + FileLoader.getInstance().checkUploadNewDataAvailable(videoPath, currentEncryptedChat != null, finalSize); } private void showAttachmentError() { @@ -1599,19 +1605,29 @@ public class ChatActivity extends BaseFragment implements NotificationCenter.Not }).start(); } - public void processSendingVideo(final String videoPath) { + public void processSendingVideo(final String videoPath, final String originalFile, final long estimatedSize, final int duration, final int width, final int height) { if (videoPath == null || videoPath.length() == 0) { return; } new Thread(new Runnable() { @Override public void run() { - String originalPath = videoPath; + String originalPath = null; + if (originalFile != null) { + originalPath = originalFile; + } else { + originalPath = videoPath; + } File temp = new File(originalPath); originalPath += temp.length() + "_" + temp.lastModified(); - TLRPC.TL_video video = (TLRPC.TL_video)MessagesStorage.getInstance().getSentFile(originalPath, currentEncryptedChat == null ? 
2 : 5); + TLRPC.TL_video video = null;// (TLRPC.TL_video)MessagesStorage.getInstance().getSentFile(originalPath, currentEncryptedChat == null ? 2 : 5); if (video == null) { - Bitmap thumb = ThumbnailUtils.createVideoThumbnail(videoPath, MediaStore.Video.Thumbnails.MINI_KIND); + Bitmap thumb = null; + if (originalFile != null) { + thumb = ThumbnailUtils.createVideoThumbnail(originalFile, MediaStore.Video.Thumbnails.MINI_KIND); + } else { + thumb = ThumbnailUtils.createVideoThumbnail(videoPath, MediaStore.Video.Thumbnails.MINI_KIND); + } TLRPC.PhotoSize size = ImageLoader.scaleAndSaveImage(thumb, 90, 90, 55, currentEncryptedChat != null); if (size == null) { return; @@ -1622,20 +1638,30 @@ public class ChatActivity extends BaseFragment implements NotificationCenter.Not video.caption = ""; video.mime_type = "video/mp4"; video.id = 0; - if (temp != null && temp.exists()) { - video.size = (int) temp.length(); + if (estimatedSize != 0) { + video.size = (int)estimatedSize; + } else { + if (temp != null && temp.exists()) { + video.size = (int) temp.length(); + } } - UserConfig.lastLocalId--; UserConfig.saveConfig(false); - MediaPlayer mp = MediaPlayer.create(ApplicationLoader.applicationContext, Uri.fromFile(new File(videoPath))); - if (mp == null) { - return; + if (duration != 0) { + video.duration = duration / 1000; + video.w = width; + video.h = height; + video.estimatedSize = true; + } else { + MediaPlayer mp = MediaPlayer.create(ApplicationLoader.applicationContext, Uri.fromFile(new File(videoPath))); + if (mp == null) { + return; + } + video.duration = (int) Math.ceil(mp.getDuration() / 1000.0f); + video.w = mp.getVideoWidth(); + video.h = mp.getVideoHeight(); + mp.release(); } - video.duration = (int) Math.ceil(mp.getDuration() / 1000.0f); - video.w = mp.getVideoWidth(); - video.h = mp.getVideoHeight(); - mp.release(); } video.path = videoPath; @@ -1730,7 +1756,7 @@ public class ChatActivity extends BaseFragment implements NotificationCenter.Not if (minDate == 0 || obj.messageOwner.date < minDate) { minDate = obj.messageOwner.date; } - if (!obj.isOut() && obj.messageOwner.unread) { + if (!obj.isOut() && obj.isUnread()) { wasUnread = true; } messagesDict.put(obj.messageOwner.id, obj); @@ -1948,7 +1974,7 @@ public class ChatActivity extends BaseFragment implements NotificationCenter.Not currentMinMsgId = Math.min(obj.messageOwner.id, currentMinMsgId); } - if (!obj.isOut() && obj.messageOwner.unread) { + if (!obj.isOut() && obj.isUnread()) { unread_to_load++; currentMarkAsRead = true; } @@ -2013,7 +2039,7 @@ public class ChatActivity extends BaseFragment implements NotificationCenter.Not dateObj.contentType = 7; messages.add(0, dateObj); } - if (!obj.isOut() && obj.messageOwner.unread) { + if (!obj.isOut() && obj.isUnread()) { if (!paused) { obj.messageOwner.unread = false; } @@ -2076,7 +2102,7 @@ public class ChatActivity extends BaseFragment implements NotificationCenter.Not } } else if (id == NotificationCenter.closeChats) { removeSelfFromStack(); - } else if (id == NotificationCenter.messagesReaded) { + } else if (id == NotificationCenter.messagesRead) { ArrayList markAsReadMessages = (ArrayList)args[0]; boolean updated = false; for (Integer ids : markAsReadMessages) { @@ -2397,7 +2423,9 @@ public class ChatActivity extends BaseFragment implements NotificationCenter.Not if (!messageObject.isUnread() && !messageObject.isFromMe()) { break; } - messageObject.messageOwner.unread = false; + if (!messageObject.isOut()) { + messageObject.messageOwner.unread = false; + } } readWhenResume = false; 
MessagesController.getInstance().markDialogAsRead(dialog_id, messages.get(0).messageOwner.id, readWithMid, 0, readWithDate, true, false); diff --git a/TMessagesProj/src/main/java/org/telegram/ui/ContactsActivity.java b/TMessagesProj/src/main/java/org/telegram/ui/ContactsActivity.java index b3d3db48c..7a54a4109 100644 --- a/TMessagesProj/src/main/java/org/telegram/ui/ContactsActivity.java +++ b/TMessagesProj/src/main/java/org/telegram/ui/ContactsActivity.java @@ -419,7 +419,7 @@ public class ContactsActivity extends BaseFragment implements NotificationCenter if (!updatingInviteText) { updatingInviteText = true; TLRPC.TL_help_getInviteText req = new TLRPC.TL_help_getInviteText(); - req.lang_code = Locale.getDefault().getCountry(); + req.lang_code = LocaleController.getLocaleString(Locale.getDefault()); if (req.lang_code == null || req.lang_code.length() == 0) { req.lang_code = "en"; } diff --git a/TMessagesProj/src/main/java/org/telegram/ui/LaunchActivity.java b/TMessagesProj/src/main/java/org/telegram/ui/LaunchActivity.java index 5b322ec4c..021a148ef 100644 --- a/TMessagesProj/src/main/java/org/telegram/ui/LaunchActivity.java +++ b/TMessagesProj/src/main/java/org/telegram/ui/LaunchActivity.java @@ -426,6 +426,13 @@ public class LaunchActivity extends ActionBarActivity implements NotificationCen pushOpened = true; } if (!pushOpened && !isNew) { + if (fragmentsStack.isEmpty()) { + if (!UserConfig.isClientActivated()) { + addFragmentToStack(new LoginActivity()); + } else { + addFragmentToStack(new MessagesActivity(null)); + } + } showLastFragment(); } @@ -463,7 +470,7 @@ public class LaunchActivity extends ActionBarActivity implements NotificationCen ChatActivity fragment = new ChatActivity(args); presentFragment(fragment, true); if (videoPath != null) { - fragment.processSendingVideo(videoPath); + fragment.processSendingVideo(videoPath, null, 0, 0, 0, 0); } if (sendingText != null) { fragment.processSendingText(sendingText); diff --git a/TMessagesProj/src/main/java/org/telegram/ui/LoginActivityPhoneView.java b/TMessagesProj/src/main/java/org/telegram/ui/LoginActivityPhoneView.java index 90299fbaf..79d0926cf 100644 --- a/TMessagesProj/src/main/java/org/telegram/ui/LoginActivityPhoneView.java +++ b/TMessagesProj/src/main/java/org/telegram/ui/LoginActivityPhoneView.java @@ -31,7 +31,6 @@ import org.telegram.messenger.ConnectionsManager; import org.telegram.messenger.FileLog; import org.telegram.messenger.R; import org.telegram.messenger.RPCRequest; -import org.telegram.messenger.Utilities; import org.telegram.ui.Views.ActionBar.BaseFragment; import org.telegram.ui.Views.SlideView; @@ -83,6 +82,9 @@ public class LoginActivityPhoneView extends SlideView implements AdapterView.OnI countryButton.setOnClickListener(new OnClickListener() { @Override public void onClick(View view) { + if (delegate == null) { + return; + } BaseFragment activity = (BaseFragment)delegate; CountrySelectActivity fragment = new CountrySelectActivity(); fragment.setCountrySelectActivityDelegate(new CountrySelectActivity.CountrySelectActivityDelegate() { @@ -342,7 +344,7 @@ public class LoginActivityPhoneView extends SlideView implements AdapterView.OnI req.api_id = BuildVars.APP_ID; req.sms_type = 0; req.phone_number = phone; - req.lang_code = Locale.getDefault().getCountry(); + req.lang_code = LocaleController.getLocaleString(Locale.getDefault()); if (req.lang_code == null || req.lang_code.length() == 0) { req.lang_code = "en"; } @@ -351,7 +353,9 @@ public class LoginActivityPhoneView extends SlideView implements 
AdapterView.OnI params.putString("phone", "+" + codeField.getText() + phoneField.getText()); params.putString("phoneFormated", phone); nextPressed = true; - delegate.needShowProgress(); + if (delegate != null) { + delegate.needShowProgress(); + } ConnectionsManager.getInstance().performRpc(req, new RPCRequest.RPCRequestDelegate() { @Override public void run(final TLObject response, final TLRPC.TL_error error) { diff --git a/TMessagesProj/src/main/java/org/telegram/ui/LoginActivitySmsView.java b/TMessagesProj/src/main/java/org/telegram/ui/LoginActivitySmsView.java index 7d91124ad..ace4d2365 100644 --- a/TMessagesProj/src/main/java/org/telegram/ui/LoginActivitySmsView.java +++ b/TMessagesProj/src/main/java/org/telegram/ui/LoginActivitySmsView.java @@ -35,7 +35,6 @@ import org.telegram.android.NotificationCenter; import org.telegram.messenger.R; import org.telegram.messenger.RPCRequest; import org.telegram.messenger.UserConfig; -import org.telegram.messenger.Utilities; import org.telegram.ui.Views.SlideView; import java.util.ArrayList; diff --git a/TMessagesProj/src/main/java/org/telegram/ui/PopupNotificationActivity.java b/TMessagesProj/src/main/java/org/telegram/ui/PopupNotificationActivity.java index 18c5c2b36..2d4bef557 100644 --- a/TMessagesProj/src/main/java/org/telegram/ui/PopupNotificationActivity.java +++ b/TMessagesProj/src/main/java/org/telegram/ui/PopupNotificationActivity.java @@ -158,7 +158,9 @@ public class PopupNotificationActivity extends Activity implements NotificationC if (currentMessageObject == null) { return; } - NotificationsController.getInstance().popupMessages.remove(currentMessageNum); + if (currentMessageNum >= 0 && currentMessageNum < NotificationsController.getInstance().popupMessages.size()) { + NotificationsController.getInstance().popupMessages.remove(currentMessageNum); + } MessagesController.getInstance().markDialogAsRead(currentMessageObject.getDialogId(), currentMessageObject.messageOwner.id, Math.max(0, currentMessageObject.messageOwner.id), 0, currentMessageObject.messageOwner.date, true, true); currentMessageObject = null; getNewMessage(); diff --git a/TMessagesProj/src/main/java/org/telegram/ui/VideoEditorActivity.java b/TMessagesProj/src/main/java/org/telegram/ui/VideoEditorActivity.java index 04ceef4b1..b72986a39 100644 --- a/TMessagesProj/src/main/java/org/telegram/ui/VideoEditorActivity.java +++ b/TMessagesProj/src/main/java/org/telegram/ui/VideoEditorActivity.java @@ -8,7 +8,13 @@ package org.telegram.ui; +import android.annotation.TargetApi; import android.content.res.Configuration; +import android.media.MediaCodec; +import android.media.MediaCodecInfo; +import android.media.MediaExtractor; +import android.media.MediaFormat; +import android.media.MediaMetadataRetriever; import android.media.MediaPlayer; import android.os.Bundle; import android.view.LayoutInflater; @@ -21,15 +27,23 @@ import android.widget.FrameLayout; import android.widget.ImageView; import android.widget.TextView; +import com.coremedia.iso.IsoFile; import com.coremedia.iso.boxes.Container; +import com.coremedia.iso.boxes.TrackBox; +import com.coremedia.iso.boxes.h264.AvcConfigurationBox; import com.googlecode.mp4parser.authoring.Movie; import com.googlecode.mp4parser.authoring.Track; import com.googlecode.mp4parser.authoring.builder.DefaultMp4Builder; import com.googlecode.mp4parser.authoring.container.mp4.MovieCreator; import com.googlecode.mp4parser.authoring.tracks.CroppedTrack; +import com.googlecode.mp4parser.util.Path; import org.telegram.android.AndroidUtilities; import 
org.telegram.android.LocaleController; +import org.telegram.android.video.InputSurface; +import org.telegram.android.video.MP4Builder; +import org.telegram.android.video.Mp4Movie; +import org.telegram.android.video.OutputSurface; import org.telegram.messenger.FileLog; import org.telegram.messenger.R; import org.telegram.messenger.UserConfig; @@ -42,13 +56,22 @@ import org.telegram.ui.Views.VideoTimelineView; import java.io.File; import java.io.FileOutputStream; +import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import java.util.ArrayList; import java.util.Arrays; import java.util.LinkedList; import java.util.List; +@TargetApi(18) public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.Callback { + private final static int OMX_TI_COLOR_FormatYUV420PackedSemiPlanar = 0x7F000100; + private final static int OMX_QCOM_COLOR_FormatYVU420SemiPlanar = 0x7FA30C00; + private final static int OMX_QCOM_COLOR_FormatYUV420PackedSemiPlanar64x32Tile2m8ka = 0x7FA30C03; + private final static int OMX_SEC_COLOR_FormatNV12Tiled = 0x7FC00002; + private final static int OMX_QCOM_COLOR_FormatYUV420PackedSemiPlanar32m = 0x7FA30C04; + private MediaPlayer videoPlayer = null; private SurfaceHolder surfaceHolder = null; private VideoTimelineView videoTimelineView = null; @@ -63,12 +86,17 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C private String videoPath = null; private int videoWidth; private int videoHeight; + private int editedVideoWidth; + private int editedVideoHeight; + private int editedVideoDuration; private float lastProgress = 0; private boolean needSeek = false; private VideoEditorActivityDelegate delegate; + private long esimatedFileSize = 0; public interface VideoEditorActivityDelegate { - public abstract void didFinishedVideoConverting(String videoPath); + public abstract void didStartVideoConverting(String videoPath, String originalPath, long esimatedSize, int duration, int width, int height); + public abstract void didAppenedVideoData(String videoPath, long finalSize); } private Runnable progressRunnable = new Runnable() { @@ -156,7 +184,8 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C finishFragment(); } else if (id == 1) { try { - startConvert(); + //startConvert(); + VideoEditWrapper.runTest(VideoEditorActivity.this); } catch (Exception e) { FileLog.e("tmessages", e); } @@ -167,16 +196,16 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C ActionBarMenu menu = actionBarLayer.createMenu(); View doneItem = menu.addItemResource(1, R.layout.group_create_done_layout); - TextView doneTextView = (TextView)doneItem.findViewById(R.id.done_button); + TextView doneTextView = (TextView) doneItem.findViewById(R.id.done_button); doneTextView.setText(LocaleController.getString("Done", R.string.Done).toUpperCase()); fragmentView = inflater.inflate(R.layout.video_editor_layout, container, false); - originalSizeTextView = (TextView)fragmentView.findViewById(R.id.original_size); - editedSizeTextView = (TextView)fragmentView.findViewById(R.id.edited_size); + originalSizeTextView = (TextView) fragmentView.findViewById(R.id.original_size); + editedSizeTextView = (TextView) fragmentView.findViewById(R.id.edited_size); videoContainerView = fragmentView.findViewById(R.id.video_container); textContainerView = fragmentView.findViewById(R.id.info_container); - videoTimelineView = (VideoTimelineView)fragmentView.findViewById(R.id.video_timeline_view); + videoTimelineView = 
(VideoTimelineView) fragmentView.findViewById(R.id.video_timeline_view); videoTimelineView.setVideoPath(videoPath); videoTimelineView.setDelegate(new VideoTimelineView.VideoTimelineViewDelegate() { @Override @@ -187,7 +216,7 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C playButton.setImageResource(R.drawable.video_play); } videoPlayer.setOnSeekCompleteListener(null); - videoPlayer.seekTo((int)(videoPlayer.getDuration() * progress)); + videoPlayer.seekTo((int) (videoPlayer.getDuration() * progress)); } catch (Exception e) { FileLog.e("tmessages", e); } @@ -204,7 +233,7 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C playButton.setImageResource(R.drawable.video_play); } videoPlayer.setOnSeekCompleteListener(null); - videoPlayer.seekTo((int)(videoPlayer.getDuration() * progress)); + videoPlayer.seekTo((int) (videoPlayer.getDuration() * progress)); } catch (Exception e) { FileLog.e("tmessages", e); } @@ -214,14 +243,14 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C } }); - videoSeekBarView = (VideoSeekBarView)fragmentView.findViewById(R.id.video_seekbar); + videoSeekBarView = (VideoSeekBarView) fragmentView.findViewById(R.id.video_seekbar); videoSeekBarView.delegate = new VideoSeekBarView.SeekBarDelegate() { @Override public void onSeekBarDrag(float progress) { if (videoPlayer.isPlaying()) { try { float prog = videoTimelineView.getLeftProgress() + (videoTimelineView.getRightProgress() - videoTimelineView.getLeft()) * progress; - videoPlayer.seekTo((int)(videoPlayer.getDuration() * prog)); + videoPlayer.seekTo((int) (videoPlayer.getDuration() * prog)); lastProgress = progress; } catch (Exception e) { FileLog.e("tmessages", e); @@ -233,7 +262,7 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C } }; - playButton = (ImageView)fragmentView.findViewById(R.id.play_button); + playButton = (ImageView) fragmentView.findViewById(R.id.play_button); playButton.setOnClickListener(new View.OnClickListener() { @Override public void onClick(View v) { @@ -253,7 +282,7 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C updateVideoOriginalInfo(); updateVideoEditedInfo(); } else { - ViewGroup parent = (ViewGroup)fragmentView.getParent(); + ViewGroup parent = (ViewGroup) fragmentView.getParent(); if (parent != null) { parent.removeView(fragmentView); } @@ -318,7 +347,7 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C File file = new File(videoPath); String videoDimension = String.format("%dx%d", videoPlayer.getVideoWidth(), videoPlayer.getVideoHeight()); int minutes = videoPlayer.getDuration() / 1000 / 60; - int seconds = (int)Math.ceil(videoPlayer.getDuration() / 1000) - minutes * 60; + int seconds = (int) Math.ceil(videoPlayer.getDuration() / 1000) - minutes * 60; String videoTimeSize = String.format("%d:%02d, %s", minutes, seconds, Utilities.formatFileSize(file.length())); originalSizeTextView.setText(String.format("%s: %s, %s", LocaleController.getString("OriginalVideo", R.string.OriginalVideo), videoDimension, videoTimeSize)); } @@ -329,18 +358,20 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C } File file = new File(videoPath); long size = file.length(); - float videoWidth = videoPlayer.getVideoWidth(); - float videoHeight = videoPlayer.getVideoHeight(); - if (videoWidth > 640 || videoHeight > 640) { - float scale = videoWidth > videoHeight ? 
640.0f / videoWidth : 640.0f / videoHeight; - videoWidth *= scale; - videoHeight *= scale; - size *= (scale * scale); + editedVideoWidth = videoPlayer.getVideoWidth(); + editedVideoHeight = videoPlayer.getVideoHeight(); + if (editedVideoWidth > 640 || editedVideoHeight > 640) { + float scale = editedVideoWidth > editedVideoHeight ? 640.0f / editedVideoWidth : 640.0f / editedVideoHeight; + editedVideoWidth *= scale; + editedVideoHeight *= scale; + size *= (scale * scale) * 1.02f; } - String videoDimension = String.format("%dx%d", (int)videoWidth, (int)videoHeight); - int minutes = videoPlayer.getDuration() / 1000 / 60; - int seconds = (int)Math.ceil(videoPlayer.getDuration() / 1000) - minutes * 60; + String videoDimension = String.format("%dx%d", editedVideoWidth, editedVideoHeight); + editedVideoDuration = videoPlayer.getDuration(); + int minutes = editedVideoDuration / 1000 / 60; + int seconds = (int) Math.ceil(editedVideoDuration / 1000) - minutes * 60; String videoTimeSize = String.format("%d:%02d, ~%s", minutes, seconds, Utilities.formatFileSize(size)); + esimatedFileSize = size; editedSizeTextView.setText(String.format("%s: %s, %s", LocaleController.getString("EditedVideo", R.string.EditedVideo), videoDimension, videoTimeSize)); } @@ -365,9 +396,9 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C height = viewHeight - AndroidUtilities.dp(176); } - float wr = (float)width / (float)videoWidth; - float hr = (float)height / (float)videoHeight; - float ar = (float)videoWidth / (float)videoHeight; + float wr = (float) width / (float) videoWidth; + float hr = (float) height / (float) videoHeight; + float ar = (float) videoWidth / (float) videoHeight; if (wr > hr) { width = (int) (height * ar); @@ -387,14 +418,14 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C public boolean onPreDraw() { originalSizeTextView.getViewTreeObserver().removeOnPreDrawListener(this); if (getParentActivity().getResources().getConfiguration().orientation == Configuration.ORIENTATION_LANDSCAPE) { - FrameLayout.LayoutParams layoutParams = (FrameLayout.LayoutParams)videoContainerView.getLayoutParams(); + FrameLayout.LayoutParams layoutParams = (FrameLayout.LayoutParams) videoContainerView.getLayoutParams(); layoutParams.topMargin = AndroidUtilities.dp(16); layoutParams.bottomMargin = AndroidUtilities.dp(16); layoutParams.width = AndroidUtilities.displaySize.x / 2 - AndroidUtilities.dp(24); layoutParams.leftMargin = AndroidUtilities.dp(16); videoContainerView.setLayoutParams(layoutParams); - layoutParams = (FrameLayout.LayoutParams)textContainerView.getLayoutParams(); + layoutParams = (FrameLayout.LayoutParams) textContainerView.getLayoutParams(); layoutParams.height = FrameLayout.LayoutParams.MATCH_PARENT; layoutParams.width = AndroidUtilities.displaySize.x / 2 - AndroidUtilities.dp(24); layoutParams.leftMargin = AndroidUtilities.displaySize.x / 2 + AndroidUtilities.dp(8); @@ -402,14 +433,14 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C layoutParams.topMargin = AndroidUtilities.dp(16); textContainerView.setLayoutParams(layoutParams); } else { - FrameLayout.LayoutParams layoutParams = (FrameLayout.LayoutParams)videoContainerView.getLayoutParams(); + FrameLayout.LayoutParams layoutParams = (FrameLayout.LayoutParams) videoContainerView.getLayoutParams(); layoutParams.topMargin = AndroidUtilities.dp(16); layoutParams.bottomMargin = AndroidUtilities.dp(160); layoutParams.width = FrameLayout.LayoutParams.MATCH_PARENT; 
layoutParams.leftMargin = 0; videoContainerView.setLayoutParams(layoutParams); - layoutParams = (FrameLayout.LayoutParams)textContainerView.getLayoutParams(); + layoutParams = (FrameLayout.LayoutParams) textContainerView.getLayoutParams(); layoutParams.height = AndroidUtilities.dp(143); layoutParams.width = FrameLayout.LayoutParams.MATCH_PARENT; layoutParams.leftMargin = 0; @@ -434,7 +465,7 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C lastProgress = 0; if (needSeek) { float prog = videoTimelineView.getLeftProgress() + (videoTimelineView.getRightProgress() - videoTimelineView.getLeft()) * videoSeekBarView.getProgress(); - videoPlayer.seekTo((int)(videoPlayer.getDuration() * prog)); + videoPlayer.seekTo((int) (videoPlayer.getDuration() * prog)); needSeek = false; } videoPlayer.setOnSeekCompleteListener(new MediaPlayer.OnSeekCompleteListener() { @@ -461,7 +492,394 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C this.delegate = delegate; } + private int selectTrack(MediaExtractor extractor, boolean audio) { + int numTracks = extractor.getTrackCount(); + for (int i = 0; i < numTracks; i++) { + MediaFormat format = extractor.getTrackFormat(i); + String mime = format.getString(MediaFormat.KEY_MIME); + if (audio) { + if (mime.startsWith("audio/")) { + return i; + } + } else { + if (mime.startsWith("video/")) { + return i; + } + } + } + return -5; + } + + private static class VideoEditWrapper implements Runnable { + private VideoEditorActivity mTest; + private VideoEditWrapper(VideoEditorActivity test) { + mTest = test; + } + + @Override + public void run() { + mTest.startConvert2(); + } + + public static void runTest(final VideoEditorActivity obj) { + new Thread(new Runnable() { + @Override + public void run() { + try { + VideoEditWrapper wrapper = new VideoEditWrapper(obj); + Thread th = new Thread(wrapper, "encoder"); + th.start(); + th.join(); + } catch (Exception e) { + FileLog.e("tmessages", e); + } + } + }).start(); + } + } + + private void didWriteData(final String videoPath, final boolean first, final long finalSize) { + AndroidUtilities.RunOnUIThread(new Runnable() { + @Override + public void run() { + if (first) { + delegate.didStartVideoConverting(videoPath, VideoEditorActivity.this.videoPath, esimatedFileSize, editedVideoDuration, editedVideoWidth, editedVideoHeight); + } else { + delegate.didAppenedVideoData(videoPath, finalSize); + } + } + }); + } + + private boolean startConvert2() { + MediaCodec decoder = null; + MediaCodec encoder = null; + MediaExtractor extractor = null; + InputSurface inputSurface = null; + OutputSurface outputSurface = null; + MP4Builder mediaMuxer = null; + File cacheFile = null; + long time = System.currentTimeMillis(); + boolean finished = true; + boolean firstWrite = true; + + class AudioBufferTemp { + ByteBuffer buffer; + int flags; + int size; + long presentationTimeUs; + } + + try { + File inputFile = new File(videoPath); + if (!inputFile.canRead()) { + return false; + } + + boolean outputDone = false; + boolean inputDone = false; + boolean decoderDone = false; + boolean muxerStarted = false; + int videoTrackIndex = -5; + int audioTrackIndex = -5; + int audioIndex = -5; + int videoIndex = -5; + int audioBufferSize = 0; + ByteBuffer audioBuffer = null; + ArrayList audioBuffers = new ArrayList(); + + MediaMetadataRetriever mediaMetadataRetriever = new MediaMetadataRetriever(); + mediaMetadataRetriever.setDataSource(inputFile.toString()); + String rotation = 
mediaMetadataRetriever.extractMetadata(MediaMetadataRetriever.METADATA_KEY_VIDEO_ROTATION); + int rotationValue = 0; + if (rotation != null) { + try { + rotationValue = Integer.parseInt(rotation); + } catch (Exception e) { + //don't promt + } + } + + extractor = new MediaExtractor(); + extractor.setDataSource(inputFile.toString()); + + String fileName = Integer.MIN_VALUE + "_" + UserConfig.lastLocalId + ".mp4"; + UserConfig.lastLocalId--; + cacheFile = new File(AndroidUtilities.getCacheDir(), fileName); + UserConfig.saveConfig(false); + + Mp4Movie movie = new Mp4Movie(); + movie.setCacheFile(cacheFile); + movie.setRotation(rotationValue); + movie.setSize(640, 360); + mediaMuxer = new MP4Builder().createMovie(movie); + + videoIndex = selectTrack(extractor, false); + if (videoIndex < 0) { + return false; + } + extractor.selectTrack(videoIndex); + MediaFormat inputFormat = extractor.getTrackFormat(videoIndex); + String mime = inputFormat.getString(MediaFormat.KEY_MIME); + + audioIndex = selectTrack(extractor, true); + if (audioIndex >= 0) { + extractor.selectTrack(audioIndex); + MediaFormat audioFormat = extractor.getTrackFormat(audioIndex); + audioTrackIndex = mediaMuxer.addTrack(audioFormat, false); + audioBufferSize = audioFormat.getInteger(MediaFormat.KEY_MAX_INPUT_SIZE); + } + + MediaFormat outputFormat = MediaFormat.createVideoFormat(mime, 640, 360); + outputFormat.setInteger(MediaFormat.KEY_COLOR_FORMAT, MediaCodecInfo.CodecCapabilities.COLOR_FormatSurface); + outputFormat.setInteger(MediaFormat.KEY_BIT_RATE, 1000000); + outputFormat.setInteger(MediaFormat.KEY_FRAME_RATE, 25); + outputFormat.setInteger(MediaFormat.KEY_I_FRAME_INTERVAL, 1); + + encoder = MediaCodec.createEncoderByType(mime); + encoder.configure(outputFormat, null, null, MediaCodec.CONFIGURE_FLAG_ENCODE); + inputSurface = new InputSurface(encoder.createInputSurface()); + inputSurface.makeCurrent(); + encoder.start(); + + decoder = MediaCodec.createDecoderByType(mime); + outputSurface = new OutputSurface(); + decoder.configure(inputFormat, outputSurface.getSurface(), null, 0); + decoder.start(); + + final int TIMEOUT_USEC = 10000; + ByteBuffer[] decoderInputBuffers = decoder.getInputBuffers(); + ByteBuffer[] encoderOutputBuffers = encoder.getOutputBuffers(); + MediaCodec.BufferInfo info = new MediaCodec.BufferInfo(); + + while (!outputDone) { + if (!inputDone) { + boolean eof = false; + int index = extractor.getSampleTrackIndex(); + if (index == videoIndex) { + int inputBufIndex = decoder.dequeueInputBuffer(TIMEOUT_USEC); + if (inputBufIndex >= 0) { + ByteBuffer inputBuf = decoderInputBuffers[inputBufIndex]; + int chunkSize = extractor.readSampleData(inputBuf, 0); + if (chunkSize < 0) { + decoder.queueInputBuffer(inputBufIndex, 0, 0, 0L, MediaCodec.BUFFER_FLAG_END_OF_STREAM); + inputDone = true; + } else { + decoder.queueInputBuffer(inputBufIndex, 0, chunkSize, extractor.getSampleTime(), 0); + extractor.advance(); + } + } + } else if (index == audioIndex) { + if (audioBuffer == null) { + audioBuffer = ByteBuffer.allocate(audioBufferSize); + } + info.size = extractor.readSampleData(audioBuffer, 0); + if (info.size < 0) { + info.size = 0; + eof = true; + } else { + if (muxerStarted) { + info.offset = 0; + info.presentationTimeUs = extractor.getSampleTime(); + info.flags = extractor.getSampleFlags(); + mediaMuxer.writeSampleData(audioTrackIndex, audioBuffer, info); + } else { + AudioBufferTemp audioBufferTemp = new AudioBufferTemp(); + audioBufferTemp.buffer = audioBuffer; + audioBufferTemp.presentationTimeUs = 
extractor.getSampleTime(); + audioBufferTemp.flags = extractor.getSampleFlags(); + audioBufferTemp.size = info.size; + audioBuffers.add(audioBufferTemp); + audioBuffer = null; + } + extractor.advance(); + } + } else if (index == -1) { + eof = true; + } + if (eof) { + int inputBufIndex = decoder.dequeueInputBuffer(TIMEOUT_USEC); + if (inputBufIndex >= 0) { + decoder.queueInputBuffer(inputBufIndex, 0, 0, 0L, MediaCodec.BUFFER_FLAG_END_OF_STREAM); + inputDone = true; + } + } + } + + boolean decoderOutputAvailable = !decoderDone; + boolean encoderOutputAvailable = true; + while (decoderOutputAvailable || encoderOutputAvailable) { + int encoderStatus = encoder.dequeueOutputBuffer(info, TIMEOUT_USEC); + if (encoderStatus == MediaCodec.INFO_TRY_AGAIN_LATER) { + encoderOutputAvailable = false; + } else if (encoderStatus == MediaCodec.INFO_OUTPUT_BUFFERS_CHANGED) { + encoderOutputBuffers = encoder.getOutputBuffers(); + } else if (encoderStatus == MediaCodec.INFO_OUTPUT_FORMAT_CHANGED) { + MediaFormat newFormat = encoder.getOutputFormat(); + if (muxerStarted) { + throw new RuntimeException("format changed twice"); + } + videoTrackIndex = mediaMuxer.addTrack(newFormat, true); + + muxerStarted = true; + if (!audioBuffers.isEmpty()) { + for (AudioBufferTemp audioBufferTemp : audioBuffers) { + info.size = audioBufferTemp.size; + info.offset = 0; + info.presentationTimeUs = audioBufferTemp.presentationTimeUs; + info.flags = audioBufferTemp.flags; + mediaMuxer.writeSampleData(audioTrackIndex, audioBufferTemp.buffer, info); + } + audioBuffers.clear(); + } + } else if (encoderStatus < 0) { + FileLog.e("tmessages", "unexpected result from encoder.dequeueOutputBuffer: " + encoderStatus); + return false; + } else { + ByteBuffer encodedData = encoderOutputBuffers[encoderStatus]; + if (encodedData == null) { + FileLog.e("tmessages", "encoderOutputBuffer " + encoderStatus + " was null"); + return false; + } + if (info.size != 0) { + if (!muxerStarted) { + throw new RuntimeException("muxer hasn't started"); + } + if ((info.flags & MediaCodec.BUFFER_FLAG_CODEC_CONFIG) == 0) { + encodedData.limit(info.size); + encodedData.position(info.offset); + encodedData.putInt(Integer.reverseBytes(info.size - 4)); + mediaMuxer.writeSampleData(videoTrackIndex, encodedData, info); + didWriteData(cacheFile.toString(), firstWrite, 0); + if (firstWrite) { + firstWrite = false; + } + } + } + outputDone = (info.flags & MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0; + encoder.releaseOutputBuffer(encoderStatus, false); + } + if (encoderStatus != MediaCodec.INFO_TRY_AGAIN_LATER) { + continue; + } + + if (!decoderDone) { + int decoderStatus = decoder.dequeueOutputBuffer(info, TIMEOUT_USEC); + if (decoderStatus == MediaCodec.INFO_TRY_AGAIN_LATER) { + decoderOutputAvailable = false; + } else if (decoderStatus == MediaCodec.INFO_OUTPUT_BUFFERS_CHANGED) { + + } else if (decoderStatus == MediaCodec.INFO_OUTPUT_FORMAT_CHANGED) { + MediaFormat newFormat = decoder.getOutputFormat(); + } else if (decoderStatus < 0) { + FileLog.e("tmessages", "unexpected result from decoder.dequeueOutputBuffer: " + decoderStatus); + return false; + } else { + boolean doRender = (info.size != 0); + decoder.releaseOutputBuffer(decoderStatus, doRender); + if (doRender) { + outputSurface.awaitNewImage(); + outputSurface.drawImage(); + inputSurface.setPresentationTime(info.presentationTimeUs * 1000); + inputSurface.swapBuffers(); + } + if ((info.flags & MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0) { + FileLog.e("tmessages", "signaling input EOS"); + //if (WORK_AROUND_BUGS) 
{ + // Bail early, possibly dropping a frame. + // return; + //} else { + encoder.signalEndOfInputStream(); + //} + } + } + } + } + + /*if (!outputDone) { without surface + int decoderStatus = decoder.dequeueOutputBuffer(info, TIMEOUT_USEC); + if (decoderStatus == MediaCodec.INFO_TRY_AGAIN_LATER) { + FileLog.e("tmessages", "no output from decoder available"); + } else if (decoderStatus == MediaCodec.INFO_OUTPUT_BUFFERS_CHANGED) { + FileLog.e("tmessages", "decoder output buffers changed"); + decoderOutputBuffers = decoder.getOutputBuffers(); + } else if (decoderStatus == MediaCodec.INFO_OUTPUT_FORMAT_CHANGED) { + decoderOutputFormat = decoder.getOutputFormat(); + FileLog.e("tmessages", "decoder output format changed: " + decoderOutputFormat); + } else if (decoderStatus < 0) { + FileLog.e("tmessages", "unexpected result from decoder.dequeueOutputBuffer: " + decoderStatus); + return false; + } else { + ByteBuffer outputFrame = decoderOutputBuffers[decoderStatus]; + outputFrame.position(info.offset); + outputFrame.limit(info.offset + info.size); + if (info.size == 0) { + FileLog.e("tmessages", "got empty frame"); + } else { + FileLog.e("tmessages", "decoded, checking frame format = " + decoderOutputFormat + " size = " + outputFrame.limit()); + } + if ((info.flags & MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0) { + FileLog.e("tmessages", "output EOS"); + outputDone = true; + } + decoder.releaseOutputBuffer(decoderStatus, false); + } + }*/ + } + } catch (Exception e) { + FileLog.e("tmessages", e); + finished = false; + } finally { + if (outputSurface != null) { + outputSurface.release(); + outputSurface = null; + } + if (inputSurface != null) { + inputSurface.release(); + inputSurface = null; + } + if (decoder != null) { + decoder.stop(); + decoder.release(); + decoder = null; + } + if (encoder != null) { + encoder.stop(); + encoder.release(); + encoder = null; + } + if (extractor != null) { + extractor.release(); + extractor = null; + } + if (mediaMuxer != null) { + try { + mediaMuxer.finishMovie(false); + } catch (Exception e) { + FileLog.e("tmessages", e); + } + mediaMuxer = null; + } + FileLog.e("tmessages", "time = " + (System.currentTimeMillis() - time)); + } + if (finished) { + didWriteData(cacheFile.toString(), firstWrite, cacheFile.length()); + } + AndroidUtilities.RunOnUIThread(new Runnable() { + @Override + public void run() { + finishFragment(); + } + }); + return finished; + } + private void startConvert() throws Exception { + IsoFile isoFile = new IsoFile(videoPath); + TrackBox trackBox = (TrackBox) Path.getPath(isoFile, "/moov/trak/mdia/minf/stbl/stsd/avc1/../../../../../"); + AvcConfigurationBox avcConfigurationBox = (AvcConfigurationBox) Path.getPath(trackBox, "mdia/minf/stbl/stsd/avc1/avcC"); + avcConfigurationBox.parseDetails(); + Movie movie = MovieCreator.build(videoPath); List tracks = movie.getTracks(); @@ -472,7 +890,7 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C for (Track track : tracks) { if (track.getSyncSamples() != null && track.getSyncSamples().length > 0) { - double duration = (double)track.getDuration() / (double)track.getTrackMetaData().getTimescale(); + double duration = (double) track.getDuration() / (double) track.getTrackMetaData().getTimescale(); startTime = correctTimeToSyncSample(track, videoTimelineView.getLeftProgress() * duration, false); endTime = videoTimelineView.getRightProgress() * duration; break; @@ -514,16 +932,11 @@ public class VideoEditorActivity extends BaseFragment implements SurfaceHolder.C fc.close(); 
fos.close(); if (delegate != null) { - delegate.didFinishedVideoConverting(cacheFile.getAbsolutePath()); + //delegate.didFinishedVideoConverting(cacheFile.getAbsolutePath()); finishFragment(); } } -// private void startEncodeVideo() { -// MediaExtractor mediaExtractor = new MediaExtractor(); -// mediaExtractor.s -// } - private static double correctTimeToSyncSample(Track track, double cutHere, boolean next) { double[] timeOfSyncSamples = new double[track.getSyncSamples().length]; long currentSample = 0; diff --git a/TMessagesProj/src/main/java/org/telegram/ui/Views/ActionBar/ActionBarActivity.java b/TMessagesProj/src/main/java/org/telegram/ui/Views/ActionBar/ActionBarActivity.java index 8aaee775a..2cde70995 100644 --- a/TMessagesProj/src/main/java/org/telegram/ui/Views/ActionBar/ActionBarActivity.java +++ b/TMessagesProj/src/main/java/org/telegram/ui/Views/ActionBar/ActionBarActivity.java @@ -619,6 +619,9 @@ public class ActionBarActivity extends Activity { } public void showLastFragment() { + if (fragmentsStack.isEmpty()) { + return; + } BaseFragment previousFragment = fragmentsStack.get(fragmentsStack.size() - 1); previousFragment.setParentActivity(this); View fragmentView = previousFragment.createView(getLayoutInflater(), null); diff --git a/TMessagesProj/src/main/java/org/telegram/ui/Views/ChatActivityEnterView.java b/TMessagesProj/src/main/java/org/telegram/ui/Views/ChatActivityEnterView.java index d6375febc..69d7542ba 100644 --- a/TMessagesProj/src/main/java/org/telegram/ui/Views/ChatActivityEnterView.java +++ b/TMessagesProj/src/main/java/org/telegram/ui/Views/ChatActivityEnterView.java @@ -109,7 +109,6 @@ public class ChatActivityEnterView implements NotificationCenter.NotificationCen } if (sizeNotifierRelativeLayout != null) { sizeNotifierRelativeLayout.delegate = null; - sizeNotifierRelativeLayout = null; } } @@ -451,12 +450,16 @@ public class ChatActivityEnterView implements NotificationCenter.NotificationCen currentHeight = keyboardHeight; } emojiPopup.setHeight(View.MeasureSpec.makeMeasureSpec(currentHeight, View.MeasureSpec.EXACTLY)); - emojiPopup.setWidth(View.MeasureSpec.makeMeasureSpec(sizeNotifierRelativeLayout.getWidth(), View.MeasureSpec.EXACTLY)); + if (sizeNotifierRelativeLayout != null) { + emojiPopup.setWidth(View.MeasureSpec.makeMeasureSpec(sizeNotifierRelativeLayout.getWidth(), View.MeasureSpec.EXACTLY)); + } emojiPopup.showAtLocation(parentActivity.getWindow().getDecorView(), 83, 0, 0); if (!keyboardVisible) { - sizeNotifierRelativeLayout.setPadding(0, 0, 0, currentHeight); - emojiButton.setImageResource(R.drawable.ic_msg_panel_hide); + if (sizeNotifierRelativeLayout != null) { + sizeNotifierRelativeLayout.setPadding(0, 0, 0, currentHeight); + emojiButton.setImageResource(R.drawable.ic_msg_panel_hide); + } return; } emojiButton.setImageResource(R.drawable.ic_msg_panel_kb); diff --git a/TMessagesProj/src/main/java/org/telegram/ui/Views/SizeNotifierRelativeLayout.java b/TMessagesProj/src/main/java/org/telegram/ui/Views/SizeNotifierRelativeLayout.java index 41667658c..f8eb84abd 100644 --- a/TMessagesProj/src/main/java/org/telegram/ui/Views/SizeNotifierRelativeLayout.java +++ b/TMessagesProj/src/main/java/org/telegram/ui/Views/SizeNotifierRelativeLayout.java @@ -15,6 +15,7 @@ import android.graphics.drawable.Drawable; import android.widget.RelativeLayout; import org.telegram.android.AndroidUtilities; +import org.telegram.messenger.FileLog; public class SizeNotifierRelativeLayout extends RelativeLayout { @@ -39,7 +40,11 @@ public class SizeNotifierRelativeLayout 
extends RelativeLayout { } public void setBackgroundImage(int resourceId) { - backgroundDrawable = getResources().getDrawable(resourceId); + try { + backgroundDrawable = getResources().getDrawable(resourceId); + } catch (Throwable e) { + FileLog.e("tmessages", e); + } } public void setBackgroundImage(Drawable bitmap) { diff --git a/TMessagesProj/src/main/res/values-de/strings.xml b/TMessagesProj/src/main/res/values-de/strings.xml index e815c6bfa..08dad3b46 100644 --- a/TMessagesProj/src/main/res/values-de/strings.xml +++ b/TMessagesProj/src/main/res/values-de/strings.xml @@ -80,7 +80,7 @@ SD-Karte - offline + unsichtbar schreibt… Anhängen schreibt... diff --git a/TMessagesProj/src/main/res/values-nl/strings.xml b/TMessagesProj/src/main/res/values-nl/strings.xml index 8913dc25e..b6fbe2842 100644 --- a/TMessagesProj/src/main/res/values-nl/strings.xml +++ b/TMessagesProj/src/main/res/values-nl/strings.xml @@ -283,7 +283,7 @@ Badgenummer Kort Lang - Standaardinstelling + Systeem standaard Standaardinstelling AUTOMATISCH MEDIA OPHALEN Bij mobiel datagebruik
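
A closing note on the (currently disabled) transcode path this commit introduces in VideoEditorActivity.startConvert2(): it is a surface-to-surface MediaCodec pipeline on API 18+, where MediaExtractor feeds the decoder, the decoder renders into an OutputSurface, each frame is redrawn onto the encoder's InputSurface, and the encoder output is muxed by the new MP4Builder while didWriteData() hands every written chunk to the uploader. The sketch below is a condensed, video-only outline of that loop under several simplifications: it omits the audio passthrough (and the AudioBufferTemp queue that holds audio samples until the muxer starts), rotation metadata, the length-prefix rewrite applied to each encoded buffer before muxing, the didWriteData()/upload callbacks, and all error handling, and it assumes the input has a video track. The class name is illustrative; this is an outline, not the committed implementation.

    import java.io.File;
    import java.nio.ByteBuffer;
    import android.annotation.TargetApi;
    import android.media.*;
    import org.telegram.android.video.*;

    @TargetApi(18)
    public class SurfaceTranscodeSketch {
        public static void transcode(File in, File out) throws Exception {
            MediaExtractor extractor = new MediaExtractor();
            extractor.setDataSource(in.toString());
            // Pick the first video track (the committed selectTrack() does the same scan).
            int videoIndex = 0;
            while (!extractor.getTrackFormat(videoIndex).getString(MediaFormat.KEY_MIME).startsWith("video/")) {
                videoIndex++;
            }
            extractor.selectTrack(videoIndex);
            MediaFormat inputFormat = extractor.getTrackFormat(videoIndex);
            String mime = inputFormat.getString(MediaFormat.KEY_MIME);

            // Fragmented MP4 writer added by this commit; data reaches the file as it is muxed.
            Mp4Movie movie = new Mp4Movie();
            movie.setCacheFile(out);
            movie.setSize(640, 360);
            MP4Builder muxer = new MP4Builder().createMovie(movie);

            // The encoder takes its input from a Surface, so no colour-format conversion is needed.
            MediaFormat outputFormat = MediaFormat.createVideoFormat(mime, 640, 360);
            outputFormat.setInteger(MediaFormat.KEY_COLOR_FORMAT, MediaCodecInfo.CodecCapabilities.COLOR_FormatSurface);
            outputFormat.setInteger(MediaFormat.KEY_BIT_RATE, 1000000);
            outputFormat.setInteger(MediaFormat.KEY_FRAME_RATE, 25);
            outputFormat.setInteger(MediaFormat.KEY_I_FRAME_INTERVAL, 1);
            MediaCodec encoder = MediaCodec.createEncoderByType(mime);
            encoder.configure(outputFormat, null, null, MediaCodec.CONFIGURE_FLAG_ENCODE);
            InputSurface inputSurface = new InputSurface(encoder.createInputSurface());
            inputSurface.makeCurrent();
            encoder.start();

            // The decoder renders into an off-screen texture that is redrawn onto the encoder surface.
            MediaCodec decoder = MediaCodec.createDecoderByType(mime);
            OutputSurface outputSurface = new OutputSurface();
            decoder.configure(inputFormat, outputSurface.getSurface(), null, 0);
            decoder.start();

            ByteBuffer[] decoderInput = decoder.getInputBuffers();
            ByteBuffer[] encoderOutput = encoder.getOutputBuffers();
            MediaCodec.BufferInfo info = new MediaCodec.BufferInfo();
            int muxTrack = -1;
            boolean inputDone = false, outputDone = false;
            while (!outputDone) {
                // Feed compressed samples from the extractor into the decoder.
                if (!inputDone) {
                    int inIndex = decoder.dequeueInputBuffer(10000);
                    if (inIndex >= 0) {
                        int size = extractor.readSampleData(decoderInput[inIndex], 0);
                        if (size < 0) {
                            decoder.queueInputBuffer(inIndex, 0, 0, 0, MediaCodec.BUFFER_FLAG_END_OF_STREAM);
                            inputDone = true;
                        } else {
                            decoder.queueInputBuffer(inIndex, 0, size, extractor.getSampleTime(), 0);
                            extractor.advance();
                        }
                    }
                }
                // Drain the encoder; the committed code calls didWriteData() after each writeSampleData().
                int encStatus = encoder.dequeueOutputBuffer(info, 10000);
                if (encStatus == MediaCodec.INFO_OUTPUT_FORMAT_CHANGED) {
                    muxTrack = muxer.addTrack(encoder.getOutputFormat(), true);
                } else if (encStatus == MediaCodec.INFO_OUTPUT_BUFFERS_CHANGED) {
                    encoderOutput = encoder.getOutputBuffers();
                } else if (encStatus >= 0) {
                    if (info.size > 0 && (info.flags & MediaCodec.BUFFER_FLAG_CODEC_CONFIG) == 0) {
                        muxer.writeSampleData(muxTrack, encoderOutput[encStatus], info);
                    }
                    outputDone = (info.flags & MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0;
                    encoder.releaseOutputBuffer(encStatus, false);
                }
                // Move decoded frames through the GL surfaces into the encoder.
                int decStatus = decoder.dequeueOutputBuffer(info, 10000);
                if (decStatus >= 0) {
                    boolean render = info.size != 0;
                    decoder.releaseOutputBuffer(decStatus, render);
                    if (render) {
                        outputSurface.awaitNewImage();
                        outputSurface.drawImage();
                        inputSurface.setPresentationTime(info.presentationTimeUs * 1000);
                        inputSurface.swapBuffers();
                    }
                    if ((info.flags & MediaCodec.BUFFER_FLAG_END_OF_STREAM) != 0) {
                        encoder.signalEndOfInputStream();
                    }
                }
            }
            extractor.release(); outputSurface.release(); inputSurface.release();
            decoder.stop(); decoder.release(); encoder.stop(); encoder.release();
            muxer.finishMovie(false);
        }
    }

The muxer is only started when the encoder reports INFO_OUTPUT_FORMAT_CHANGED, because that is the first point at which the real output format (including codec-specific data) is known; that is also why the committed startConvert2() queues audio samples in memory and flushes them to the audio track only once the video track has been added.
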