I tried to modify the code to handle stereo sound. Here is the relevant part of the LoadWAV function:

u32 SampleCount = SampleDataSize / (sizeof(s16)*ChannelCount); s16 *SourceData = SampleData; if (ChannelCount == 1) { Result.Samples[0] = SampleData; Result.Samples[1] = 0; } else if (ChannelCount == 2) { Result.Samples[0] = PushArray(Arena, SampleCount, s16, {0,16}); Result.Samples[1] = PushArray(Arena, SampleCount, s16, {0,16}); for (u32 SampleIndex = 0; SampleIndex < SampleCount; ++SampleIndex) { Result.Samples[0][SampleIndex] = SourceData[2*SampleIndex]; Result.Samples[1][SampleIndex] = SourceData[1+2*SampleIndex]; } }

In the mixer code, inside the loop that copies samples from the playing sound into the audio buffer, I modified the code slightly so that the right channel is filled with its own data:

// Mixes ChunksToMix chunks of four consecutive output samples from Sound into
// the Dest0 (channel 0) and Dest1 (channel 1) accumulation buffers, linearly
// interpolating between neighbouring source samples and applying the
// per-channel volume ramps.
for (u32 LoopIndex = 0; LoopIndex < ChunksToMix; ++LoopIndex)
{
    // A sound whose second channel pointer is null (mono) feeds channel 0's
    // data to both outputs instead of dereferencing a null pointer.
    u32 RightChannelIndex = Sound->Samples[1] ? 1 : 0;

    // Fractional source positions of the four output samples in this chunk,
    // lane 0 first.
    f32 SampleP = BeginSampleP + LoopIndexC*(f32)LoopIndex;
    __m128 SamplePos = _mm_setr_ps(SampleP + 0.0f*dSample,
                                   SampleP + 1.0f*dSample,
                                   SampleP + 2.0f*dSample,
                                   SampleP + 3.0f*dSample);

    // Truncate to the integer sample index; Frac is the interpolation weight
    // towards the following sample.
    __m128i SampleIndex = _mm_cvttps_epi32(SamplePos);
    __m128 Frac = _mm_sub_ps(SamplePos, _mm_cvtepi32_ps(SampleIndex));

    // The gathers below use _mm_setr_ps (first argument -> lane 0) so that
    // lane i holds the sample for SampleIndex lane i, matching the lane order
    // of SamplePos, Frac and the destination buffers. _mm_set_ps fills lanes
    // in the opposite order and would reverse every group of four samples.
    __m128 SampleValueF0 = _mm_setr_ps(Sound->Samples[0][((s32 *)&SampleIndex)[0]],
                                       Sound->Samples[0][((s32 *)&SampleIndex)[1]],
                                       Sound->Samples[0][((s32 *)&SampleIndex)[2]],
                                       Sound->Samples[0][((s32 *)&SampleIndex)[3]]);
    __m128 SampleValueC0 = _mm_setr_ps(Sound->Samples[0][((s32 *)&SampleIndex)[0] + 1],
                                       Sound->Samples[0][((s32 *)&SampleIndex)[1] + 1],
                                       Sound->Samples[0][((s32 *)&SampleIndex)[2] + 1],
                                       Sound->Samples[0][((s32 *)&SampleIndex)[3] + 1]);
    // Linear interpolation: (1 - Frac)*floor sample + Frac*next sample.
    __m128 SampleValue0 = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, Frac), SampleValueF0),
                                     _mm_mul_ps(Frac, SampleValueC0));

    __m128 SampleValueF1 = _mm_setr_ps(Sound->Samples[RightChannelIndex][((s32 *)&SampleIndex)[0]],
                                       Sound->Samples[RightChannelIndex][((s32 *)&SampleIndex)[1]],
                                       Sound->Samples[RightChannelIndex][((s32 *)&SampleIndex)[2]],
                                       Sound->Samples[RightChannelIndex][((s32 *)&SampleIndex)[3]]);
    __m128 SampleValueC1 = _mm_setr_ps(Sound->Samples[RightChannelIndex][((s32 *)&SampleIndex)[0] + 1],
                                       Sound->Samples[RightChannelIndex][((s32 *)&SampleIndex)[1] + 1],
                                       Sound->Samples[RightChannelIndex][((s32 *)&SampleIndex)[2] + 1],
                                       Sound->Samples[RightChannelIndex][((s32 *)&SampleIndex)[3] + 1]);
    __m128 SampleValue1 = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, Frac), SampleValueF1),
                                     _mm_mul_ps(Frac, SampleValueC1));

    // Accumulate the volume-scaled samples into the destination buffers.
    __m128 D0 = _mm_load_ps((f32 *)&Dest0[0]);
    __m128 D1 = _mm_load_ps((f32 *)&Dest1[0]);

    D0 = _mm_add_ps(D0, _mm_mul_ps(_mm_mul_ps(MasterVolume0, Volume0), SampleValue0));
    D1 = _mm_add_ps(D1, _mm_mul_ps(_mm_mul_ps(MasterVolume1, Volume1), SampleValue1));

    _mm_store_ps((f32 *)&Dest0[0], D0);
    _mm_store_ps((f32 *)&Dest1[0], D1);

    // Advance to the next chunk and step the volume ramps.
    ++Dest0;
    ++Dest1;
    Volume0 = _mm_add_ps(Volume0, dVolumeChunk0);
    Volume1 = _mm_add_ps(Volume1, dVolumeChunk1);
}

However, that did not work: I kept getting garbage on the right channel. After extensive testing I was sure that the data itself was correct and that the only possible place for a mistake was the code that reads it, so I experimented and got it working as follows:

s16 *RightChannelSamples = (s16 *)((u08 *)Sound->Samples + Sound->SampleCount*sizeof(s16)); __m128 SampleValueF1 = _mm_set_ps(RightChannelSamples[((s32 *)&SampleIndex)[0]], RightChannelSamples[((s32 *)&SampleIndex)[1]], RightChannelSamples[((s32 *)&SampleIndex)[2]], RightChannelSamples[((s32 *)&SampleIndex)[3]]); __m128 SampleValueC1 = _mm_set_ps(RightChannelSamples[((s32 *)&SampleIndex)[0] + 1], RightChannelSamples[((s32 *)&SampleIndex)[1] + 1], RightChannelSamples[((s32 *)&SampleIndex)[2] + 1], RightChannelSamples[((s32 *)&SampleIndex)[3] + 1]); __m128 SampleValue1 = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, Frac), SampleValueF1), _mm_mul_ps(Frac, SampleValueC1));

Sound is a loaded_sound:

struct loaded_sound { // NOTE: Sample count is divided by 8 s16 *Samples[2]; u32 SampleCount; u32 ChannelCount; };

It is written out like this in the asset packer:

// Writes the sound to Out one whole channel at a time, so the file holds all
// of channel 0's samples followed by all of channel 1's samples.
for (u32 ChannelIndex = 0; ChannelIndex < WAV.ChannelCount; ++ChannelIndex)
{
    size_t ChannelByteCount = Dest->Sound.SampleCount*sizeof(s16);

    // fwrite returns the number of complete items written. The channel is
    // written as a single item, so any result other than 1 is a short write
    // that would leave the asset file truncated. A zero-byte channel has
    // nothing to write (fwrite would return 0), so it is skipped.
    if ((ChannelByteCount > 0) &&
        (fwrite(WAV.Samples[ChannelIndex], ChannelByteCount, 1, Out) != 1))
    {
        fprintf(stderr, "ERROR: failed to write %lu bytes for sound channel %u\n",
                (unsigned long)ChannelByteCount, (unsigned)ChannelIndex);
    }
}

So the layout in the asset file is [LEFT LEFT LEFT LEFT ...][RIGHT RIGHT RIGHT RIGHT ...], with each block holding SampleCount samples.

I thought that Sound->Samples[1] would advance past the entire left block, but maybe C doesn't know how far to advance? I'm confused. Can anyone make sense of this?