1.8 or latest code crashes in SIMD code

Issue #204 resolved
Brad Smith created an issue

Since upgrading to 1.8 or the latest code I can no longer encode anything with FFmpeg. 1.7 worked fine. Disabling the assembly code so it falls back to the C code paths works fine.

Program terminated with signal SIGSEGV, Segmentation fault.
#0  0x00000f34ed4aede9 in x265_pixel_sa8d_8x8_internal_avx2 () from /usr/local/lib/libx265.so.5.0
(gdb) bt full
#0  0x00000f34ed4aede9 in x265_pixel_sa8d_8x8_internal_avx2 () from /usr/local/lib/libx265.so.5.0
No symbol table info available.
#1  0x00000f34ed4aefff in x265_pixel_sa8d_16x16_internal_avx2 () from /usr/local/lib/libx265.so.5.0
No symbol table info available.
#2  0x00000f34ed4af0be in x265_pixel_sa8d_32x32_avx2 () from /usr/local/lib/libx265.so.5.0
No symbol table info available.
#3  0x00000f34ed421ac7 in x265::Search::estIntraPredQT (this=0xf34fdc9fbd0, intraMode=..., cuGeom=..., depthRange=0xf351ba5d6d0)
    at /home/ports/pobj/x265-1.9/x265_1.9/source/encoder/search.cpp:1527
        mode = 34
        fenc = 0xf34b73b0400 '\020' <repeats 200 times>...
        costShift = 2
        stride = 64
        mpms = 67108867
        sad = 114688
        scaleStride = 32
        mpmModes = {0, 1, 26}
        sa8d = 0xf34ed4af050 <x265_pixel_sa8d_32x32_avx2>
        modeCosts = {114689, 114691, 114696 <repeats 24 times>, 114691, 114696, 114696, 114696, 114696, 114696, 114696, 114696, 0}
        bits = 6
        planar = 0xf34fdc9fc38 '\200' <repeats 200 times>...
        paddedBcost = 0
        intraNeighbors = {numIntraNeighbor = 0, totalUnits = 65, aboveUnits = 32, leftUnits = 32, unitWidth = 4, unitHeight = 4, log2TrSize = 6, bNeighborFlags = {
            false <repeats 65 times>}}
        scaleTuSize = 32
        rbits = 6
        candCostList = {16720821992704, 16719027000128, 15, 4294967295, 16716901651536, 16718909078553, 25769803775, 25769803781, 16720771536448, 16719996900323, 25769803777, 8589934597, 
          16720771536496, 16719996878099, 463853168, 6}
        rdModeList = {0, 0, 0, 0, 2900807980, 3892, 2896359272, 3892, 266242, 0, 3623422084, 1074018641, 514309376, 3893, 270336, 0}
        bcost = 114689
        maxCandCount = 5
        bmode = 0
        icosts = {rdcost = 0, bits = 0, distortion = 0, energy = 0}
        puIdx = 0
        cu = @0xf34fdca62d8: {static s_partSet = {0xf34ed793e30 <bcast256(unsigned char*, unsigned char)>, 0xf34ed793e70 <bcast64(unsigned char*, unsigned char)>, 
            0xf34ed793f20 <bcast16(unsigned char*, unsigned char)>, 0xf34ed793f80 <bcast4(unsigned char*, unsigned char)>, 0xf34ed793fc0 <bcast1(unsigned char*, unsigned char)>}, 
          static s_numPartInCUSize = 16, m_encData = 0xf3493c9e400, m_slice = 0xf351ac36670, m_partCopy = 0xf34ed793ff0 <copy256(unsigned char*, unsigned char*)>, 
          m_partSet = 0xf34ed793e30 <bcast256(unsigned char*, unsigned char)>, m_subPartCopy = 0xf34ed794030 <copy64(unsigned char*, unsigned char*)>, 
          m_subPartSet = 0xf34ed793e70 <bcast64(unsigned char*, unsigned char)>, m_cuAddr = 0, m_absIdxInCTU = 0, m_cuPelX = 0, m_cuPelY = 0, m_numPartitions = 256, m_chromaFormat = 1, 
          m_hChromaShift = 1, m_vChromaShift = 1, m_qp = 0xf34f71f2a00 '\017' <repeats 200 times>..., m_log2CUSize = 0xf34f71f2b00 '\006' <repeats 200 times>..., 
          m_lumaIntraDir = 0xf34f71f2c00 '\377' <repeats 200 times>..., m_tqBypass = 0xf34f71f2e00 "", m_refIdx = {0xf34f71f2f00 '\377' <repeats 200 times>..., 
            0xf34f71f3000 '\377' <repeats 200 times>...}, m_cuDepth = 0xf34f71f3100 "", m_predMode = 0xf34f71f3200 '\002' <repeats 200 times>..., m_partSize = 0xf34f71f3300 "", 
          m_mergeFlag = 0xf34f71f3400 "", m_interDir = 0xf34f71f3500 "", m_mvpIdx = {0xf34f71f3600 "", 0xf34f71f3700 ""}, m_tuDepth = 0xf34f71f3800 "", m_transformSkip = {
            0xf34f71f3900 "", 0xf34f71f3a00 "", 0xf34f71f3b00 ""}, m_cbf = {0xf34f71f3c00 "", 0xf34f71f3d00 "", 0xf34f71f3e00 ""}, 
          m_chromaIntraDir = 0xf34f71f2d00 '\377' <repeats 200 times>..., m_trCoeff = {0xf34f4a5e000, 0xf34f4a60000, 0xf34f4a60800}, m_mv = {0xf34fc8de000, 0xf34fc8de400}, m_mvd = {
            0xf34fc8de800, 0xf34fc8dec00}, m_cuAboveLeft = 0x0, m_cuAboveRight = 0x0, m_cuAbove = 0x0, m_cuLeft = 0x0}
        reconYuv = 0xf34fdca6460
        predYuv = 0xf34fdca6430
        fencYuv = 0xf34fdcabc80
        depth = 0
        initTuDepth = 0
        numPU = 1
        log2TrSize = 6
        tuSize = 64
        qNumParts = 64
        sizeIdx = 3
        absPartIdx = 0
        totalDistortion = 0
        checkTransformSkip = 0
#4  0x00000f34ed420c56 in x265::Search::checkIntra (this=0xf34fdc9fbd0, intraMode=..., cuGeom=..., partSize=x265::SIZE_2Nx2N)
    at /home/ports/pobj/x265-1.9/x265_1.9/source/encoder/search.cpp:1171
        cu = @0xf34fdca62d8: {static s_partSet = <same as static member of an already seen type>, static s_numPartInCUSize = 16, m_encData = 0xf3493c9e400, m_slice = 0xf351ac36670, 
          m_partCopy = 0xf34ed793ff0 <copy256(unsigned char*, unsigned char*)>, m_partSet = 0xf34ed793e30 <bcast256(unsigned char*, unsigned char)>, 
          m_subPartCopy = 0xf34ed794030 <copy64(unsigned char*, unsigned char*)>, m_subPartSet = 0xf34ed793e70 <bcast64(unsigned char*, unsigned char)>, m_cuAddr = 0, m_absIdxInCTU = 0, 
          m_cuPelX = 0, m_cuPelY = 0, m_numPartitions = 256, m_chromaFormat = 1, m_hChromaShift = 1, m_vChromaShift = 1, m_qp = 0xf34f71f2a00 '\017' <repeats 200 times>..., 
          m_log2CUSize = 0xf34f71f2b00 '\006' <repeats 200 times>..., m_lumaIntraDir = 0xf34f71f2c00 '\377' <repeats 200 times>..., m_tqBypass = 0xf34f71f2e00 "", m_refIdx = {
            0xf34f71f2f00 '\377' <repeats 200 times>..., 0xf34f71f3000 '\377' <repeats 200 times>...}, m_cuDepth = 0xf34f71f3100 "", 
          m_predMode = 0xf34f71f3200 '\002' <repeats 200 times>..., m_partSize = 0xf34f71f3300 "", m_mergeFlag = 0xf34f71f3400 "", m_interDir = 0xf34f71f3500 "", m_mvpIdx = {
            0xf34f71f3600 "", 0xf34f71f3700 ""}, m_tuDepth = 0xf34f71f3800 "", m_transformSkip = {0xf34f71f3900 "", 0xf34f71f3a00 "", 0xf34f71f3b00 ""}, m_cbf = {0xf34f71f3c00 "", 
            0xf34f71f3d00 "", 0xf34f71f3e00 ""}, m_chromaIntraDir = 0xf34f71f2d00 '\377' <repeats 200 times>..., m_trCoeff = {0xf34f4a5e000, 0xf34f4a60000, 0xf34f4a60800}, m_mv = {
            0xf34fc8de000, 0xf34fc8de400}, m_mvd = {0xf34fc8de800, 0xf34fc8dec00}, m_cuAboveLeft = 0x0, m_cuAboveRight = 0x0, m_cuAbove = 0x0, m_cuLeft = 0x0}
        tuDepthRange = {5, 5}
        bCodeDQP = false

Comments (48)

  1. Deepthi Nandakumar

    Very odd. You can disable the AVX code paths by using --asm

    Let us know what you find. What's your ffmpeg build and encode command line?

  2. Brad Smith reporter

    I don't mean disabling AVX via the x265 program but utilizing x265 via FFmpeg. Is there a way of disabling just AVX at build time (and leaving SSE enabled)?

  3. Brad Smith reporter

    It looks like FFmpeg's -cpuflags doesn't make any difference with x265. I assume without looking at the code that x265 does CPU feature detection independently of FFmpeg.

  4. Deepthi Nandakumar

    Yes, x265 does detect CPU independently.

    There's no way/no need to disable specific instruction sets at build time (unless you don't have yasm, in which case ENABLE_ASSEMBLY is turned off). I haven't tried this with ffmpeg, but I assume you could pass the asm flag as part of the x265 option list.

  5. Brad Smith reporter

    I see crashing also with the x265 program. If I disable most of the SIMD code except for MMX and SSE, as in SSE2 or newer then it encodes fine. I also noticed the SIGILL with XOP.

    $ x265 --asm "mmx2,sse,sse2" cats.y4m cats.hevc 
    y4m  [info]: 592x320 fps 25/1 i420p8 sar 1:1 frames 0 - 975 of 976
    raw  [info]: output file: cats.hevc
    x265 [info]: HEVC encoder version 1.8+43-04575a459a16
    x265 [info]: build info [OpenBSD][clang 3.5.0][64 bit] 8bit
    x265 [info]: using cpu capabilities: MMX2 SSE2
    x265 [info]: Main profile, Level-2.1 (Main tier)
    x265 [info]: Thread pool created using 4 threads
    x265 [info]: frame threads / pool features       : 2 / wpp(5 rows)
    x265 [info]: Coding QT: max CU size, min CU size : 64 / 8
    x265 [info]: Residual QT: max TU size, max depth : 32 / 1 inter / 1 intra
    x265 [info]: ME / range / subpel / merge         : hex / 57 / 2 / 2
    x265 [info]: Keyframe min / max / scenecut       : 25 / 250 / 40
    x265 [info]: Lookahead / bframes / badapt        : 20 / 4 / 2
    x265 [info]: b-pyramid / weightp / weightb       : 1 / 1 / 0
    x265 [info]: References / ref-limit  cu / depth  : 3 / 0 / 0
    x265 [info]: AQ: mode / str / qg-size / cu-tree  : 1 / 1.0 / 32 / 1
    x265 [info]: Rate Control / qCompress            : CRF-28.0 / 0.60
    x265 [info]: tools: rd=3 psy-rd=0.30 signhide tmvp strong-intra-smoothing
    x265 [info]: tools: deblock sao
    Segmentation fault (core dumped)
    
    Program terminated with signal SIGILL, Illegal instruction.
    #0  0x0000126665d385f1 in x265_frame_init_lowres_core_xop () from /usr/local/lib/libx265.so.5.0
    
    Program terminated with signal SIGSEGV, Segmentation fault.
    #0  0x0000012e7faf68fc in x265_mbtree_propagate_cost_avx2 () from /usr/local/lib/libx265.so.5.0
    
    Program terminated with signal SIGSEGV, Segmentation fault.
    #0  0x00001048c1a3a86c in x265_mbtree_propagate_cost_avx () from /usr/local/lib/libx265.so.5.0
    
    Program terminated with signal SIGSEGV, Segmentation fault.
    #0  0x00000c13a0ecd7c3 in x265_mbtree_propagate_cost_sse2 () from /usr/local/lib/libx265.so.5.0
    
  6. M CHEN

    In the report above, the crash is a bug in my AVX version of mbtree_propagate_cost; I fixed it last Friday.

  7. M CHEN

    I sent a patch to fix a constant read overflow bug; could you try again?

    btw: What's your platform, OS X or Linux?

  8. Brad Smith reporter

    Where is this patch located? I could try it if I am pointed in the right direction.

    Neither, OpenBSD.

  9. Former user Account Deleted

    Can you provide your CPU information? Also, can you share the command prompt output for "using cpu capabilities" without the "--asm" option (default x265 detected cpu capabilities)?

  10. Brad Smith reporter
    cpu0: Intel(R) Core(TM) i7-4600U CPU @ 2.10GHz, 1995.65 MHz
    cpu0: FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CFLUSH,DS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE,SSE3,PCLMUL,DTES64,MWAIT,DS-CPL,VMX,SMX,EST,TM2,SSSE3,FMA3,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,DEADLINE,AES,XSAVE,AVX,F16C,RDRAND,NXE,PAGE1GB,LONG,LAHF,ABM,PERF,ITSC,FSGSBASE,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,SENSOR,ARAT
    
    x265 [info]: using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX AVX2 FMA3 LZCNT BMI2
    
  11. Brad Smith reporter

    No, OpenBSD is not reporting that AVX2 is not supported. It works fine and worked fine with 1.7. AVX2 code works fine with other programs / libraries such as FFmpeg, x264 and libvpx. The crashing also is not specific to AVX code as shown above.

  12. Brad Smith reporter

    From the post above where I am testing with the x265 program the crashing is not specific to the AVX2 code path. Crashing also happens with AVX and SSE2.

  13. M CHEN

    OS X 10.11 shows no crash with the x265 command line. Of course, I don't have your sequence; I used 720p50_parkrun_ter.y4m and city_4cif_60fps.y4m.

  14. Former user Account Deleted

    We are not able to reproduce this issue at our end. If possible, can you give us access to your machine so we can check the problem? We are trying to install OpenBSD 5.8 to further check this issue. Can you share the test sequence you used for encoding?

  15. Brad Smith reporter

    I'll get back to you on the weekend when I have time to do further testing on my test system and provide access to it. There is nothing special about the test sequence. It's just converting any input video via either FFmpeg or the x265 program to HEVC.

  16. Sami Farin

    running avx2 code on avx CPU. "Feature" added in r11101.

    => 0x00007ffff57fc000 <x265_mbtree_propagate_cost_avx+16>:  vbroadcasti128 0x324ef7(%rip),%ymm5        # 0x7ffff5b20f00
    
  17. Brad Smith reporter

    Testing on my test box (as opposed to my laptop) I also see the same crashing though with only SSE2 as this system doesn't have AVX / AVX2. So if anyone wants access to this system to play around with it let me know via e-mail.. brad at comstyle.com

    $ egdb x265 x265.core
    GNU gdb (GDB) 7.10
    Copyright (C) 2015 Free Software Foundation, Inc.
    License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
    This is free software: you are free to change and redistribute it.
    There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
    and "show warranty" for details.
    This GDB was configured as "x86_64-unknown-openbsd5.8".
    Type "show configuration" for configuration details.
    For bug reporting instructions, please see:
    <http://www.gnu.org/software/gdb/bugs/>.
    Find the GDB manual and other documentation resources online at:
    <http://www.gnu.org/software/gdb/documentation/>.
    For help, type "help".
    Type "apropos word" to search for commands related to "word"...
    Reading symbols from x265...done.
    [New process 615]
    [New process 4367]
    [New process 29347]
    [New process 23420]
    [New process 21565]
    Core was generated by `x265'.
    Program terminated with signal SIGSEGV, Segmentation fault.
    #0  x265_mbtree_propagate_cost_sse2.loop () at /usr/ports/pobj/x265-1.9/x265_1.9/source/common/x86/mc-a2.asm:1008
    1008        movh        m2, [r2+r5*4]       ; intra
    [Current thread is 1 (process 615)]
    
  18. M CHEN

    Could you double-check the C code? The crash is on this line; maybe the input 'len' has a problem. Could you show all registers and parameters? Thanks.

  19. Brad Smith reporter

    Using --no-asm x265 is able to encode content just fine.

    (gdb) info registers
    rax            0x128856e89620   20376782935584
    rbx            0xafc    2812
    rcx            0x1288882ecd7e   20377609620862
    rdx            0x128859f94f6c   20376834363244
    rsi            0x12886ab77000   20377115258880
    rdi            0x128853f3a500   20376733328640
    rbp            0x128808026d70   0x128808026d70
    rsp            0x128808026bf8   0x128808026bf8
    r8             0x128842146f6c   20376433487724
    r9             0x24     36
    r10            0x1288882ecd7e   20377609620862
    r11            0x128842146f6c   20376433487724
    r12            0x12888030dd00   20377475538176
    r13            0x12888fe86650   20377739224656
    r14            0x12880802e9d0   20375459260880
    r15            0x25     37
    rip            0x128856e89643   0x128856e89643 <x265_mbtree_propagate_cost_sse2.loop>
    eflags         0x10297  [ CF PF AF SF IF RF ]
    cs             0x2b     43
    ss             0x23     35
    ds             0x23     35
    es             0x23     35
    fs             0x23     35
    gs             0x23     35
    (gdb) bt full
    #0  x265_mbtree_propagate_cost_sse2.loop () at /home/ports/pobj/x265-1.9/x265_1.9/source/common/x86/mc-a2.asm:1008
    No locals.
    #1  0x0000128856e09666 in x265::Lookahead::estimateCUPropagate (this=0x1287aa132e00, frames=0x12880802e9d0, averageDuration=0.040000000000000008, p0=17, p1=19, b=18, referenced=0)
        at /home/ports/pobj/x265-1.9/x265_1.9/source/encoder/slicetype.cpp:1737
            cuIndex = 703
            blocky = 19
            refCosts = {0x128831c75800, 0x1287b0119000}
            distScaleFactor = 128
            bipredWeight = 32
            bipredWeights = {32, 32}
            listDist = {0, 0}
            propagateCost = 0x12886ab77000
            fpsFactor = 0.99999999999999978
            strideInCU = 37
    
  20. Former user Account Deleted

    I think he is asking for the test sequence (input video) you used for encoding.

  21. Brad Smith reporter

    If that is what he meant then there is nothing special about what I am doing. I just convert any video I have around to Y4M using FFmpeg ("ffmpeg -i input.[mkv|mp4|avi] output.y4m") and then just run "x265 input.y4m output.hevc".

  22. Brad Smith reporter

    That's not surprising at all. As I said earlier, if anyone wants access to my test box to see what's up let me know and I will provide you with access.

  23. Former user Account Deleted

    We have installed OpenBSD here. Surprisingly, I am seeing crash with "--no-asm" option as well. I am debugging the issue to find the root cause.

  24. Brad Smith reporter

    Ok, the reason I ask is i386 sometimes exposes issues that won't show up on amd64. Does the crash you're seeing with --no-asm happen to be a stack protector related crash?

  25. Former user Account Deleted
    GNU gdb (GDB) 7.9.1
    Copyright (C) 2015 Free Software Foundation, Inc.
    License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
    This is free software: you are free to change and redistribute it.
    There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
    and "show warranty" for details.
    This GDB was configured as "i386-unknown-openbsd5.8".
    Type "show configuration" for configuration details.
    For bug reporting instructions, please see:
    <http://www.gnu.org/software/gdb/bugs/>.
    Find the GDB manual and other documentation resources online at:
    <http://www.gnu.org/software/gdb/documentation/>.
    For help, type "help".
    Type "apropos word" to search for commands related to "word"...
    Reading symbols from x265...done.
    [New process 26331]
    [New process 16640]
    [New process 16995]
    [New process 25660]
    [New process 29411]
    [New process 5564]
    [New process 17584]
    [New process 19230]
    Core was generated by `x265'.
    Program terminated with signal SIGSEGV, Segmentation fault.
    #0  0x15d8750d in (anonymous namespace)::_sa8d_8x8 (
        pix1=0x78ac5400 "mjlmlmlprtrsustuwvvvvvvtuxxzzyy|llklmlmpqtsustuxyxwwyxytutyyxxzykimkmmpnrrssttv{|ywxz{zvwxxyzzxylklmmonprtrtvtvy|{zzzy~ttvxy{yz~mimlkplqpsttsuux|z{z|||uwwzzzz{{lmlmllmrruuuwuwy{|{{{|~wvwxzz{z{mlmlmnos"...,
        i_pix1=32, pix2=0x20 <error: Cannot access memory at address 0x20>, i_pix2=0) at /home/mcw/projects/x265/source/common/pixel.cpp:283
    283             a0 = pix1[0] - pix2[0];
    
  26. M CHEN

    I guess the problem is with the OpenBSD type 'intptr_t'. @Dnyaneshwar, could you try changing 'intptr_t' to 'int' and check again?

  27. Brad Smith reporter

    With the latest code plus that patch it resolves the crashing issues. I can now encode content using x265 with SSE2 / AVX / AVX2 as well as via FFmpeg. Thank you very much.

  28. Deepthi Nandakumar

    asm: fix mbtree_propagate_cost asm failure, fixes #204

    The SSE2 asm code reads and writes an extra 4 bytes if the loop counter is not a multiple of 2, as the SSE2 asm code processes 2 int values in a single iteration.

    The AVX asm code reads and writes an extra 4, 8 or 12 bytes if the loop counter is not a multiple of 4, as the AVX asm code processes 4 int values in a single iteration.

    → <<cset a95e4de632bd>>

  29. Log in to comment