From 76fd9d215c25874b1c5d33355de0ed983922c32d Mon Sep 17 00:00:00 2001
From: Saracen <SaracenOne@gmail.com>
Date: Thu, 24 May 2018 21:35:39 +0100
Subject: [PATCH] Fixes for microphone clipping and latency (marcelofg55)

---
 drivers/wasapi/audio_driver_wasapi.cpp | 88 ++++++++++++++++++--------
 drivers/wasapi/audio_driver_wasapi.h   |  2 +
 servers/audio/audio_stream.cpp         | 16 +++--
 servers/audio/audio_stream.h           |  2 +-
 servers/audio_server.h                 |  5 ++
 5 files changed, 78 insertions(+), 35 deletions(-)

diff --git a/drivers/wasapi/audio_driver_wasapi.cpp b/drivers/wasapi/audio_driver_wasapi.cpp
index db09a61066c..8fe83a3be50 100644
--- a/drivers/wasapi/audio_driver_wasapi.cpp
+++ b/drivers/wasapi/audio_driver_wasapi.cpp
@@ -432,30 +432,27 @@ Error AudioDriverWASAPI::init_capture_devices(bool reinit) {
 		microphone_device_output_wasapi->frame_size = (microphone_device_output_wasapi->bits_per_sample / 8) * microphone_device_output_wasapi->channels;
 
 		microphone_device_output_wasapi->current_capture_index = 0;
+		microphone_device_output_wasapi->current_capture_size = 0;
 
-		if (pwfex->wFormatTag == WAVE_FORMAT_EXTENSIBLE) {
+		WORD format_tag = pwfex->wFormatTag;
+		if (format_tag == WAVE_FORMAT_EXTENSIBLE) {
 			WAVEFORMATEXTENSIBLE *wfex = (WAVEFORMATEXTENSIBLE *)pwfex;
 
 			if (wfex->SubFormat == KSDATAFORMAT_SUBTYPE_PCM) {
-				microphone_device_output_wasapi->microphone_format = MicrophoneDeviceOutputDirect::FORMAT_PCM;
+				format_tag = WAVE_FORMAT_PCM;
 			} else if (wfex->SubFormat == KSDATAFORMAT_SUBTYPE_IEEE_FLOAT) {
-				microphone_device_output_wasapi->microphone_format = MicrophoneDeviceOutputDirect::FORMAT_FLOAT;
+				format_tag = WAVE_FORMAT_IEEE_FLOAT;
 			} else {
 				ERR_PRINT("WASAPI: Format not supported");
 				ERR_FAIL_V(ERR_CANT_OPEN);
 			}
 		} else {
-			if (pwfex->wFormatTag != WAVE_FORMAT_PCM && pwfex->wFormatTag != WAVE_FORMAT_IEEE_FLOAT) {
+			if (format_tag != WAVE_FORMAT_PCM && format_tag != WAVE_FORMAT_IEEE_FLOAT) {
 				ERR_PRINT("WASAPI: Format not supported");
 				ERR_FAIL_V(ERR_CANT_OPEN);
-			} else {
-				if (pwfex->wFormatTag == WAVE_FORMAT_PCM) {
-					microphone_device_output_wasapi->microphone_format = MicrophoneDeviceOutputDirect::FORMAT_PCM;
-				} else {
-					microphone_device_output_wasapi->microphone_format = MicrophoneDeviceOutputDirect::FORMAT_FLOAT;
-				}
 			}
 		}
+		microphone_device_output_wasapi->capture_format_tag = format_tag;
 
 		hr = microphone_device_output_wasapi->audio_client->Initialize(AUDCLNT_SHAREMODE_SHARED, 0, REFTIMES_PER_SEC, 0, pwfex, NULL);
 		ERR_FAIL_COND_V(hr != S_OK, ERR_CANT_OPEN);
@@ -466,7 +463,7 @@ Error AudioDriverWASAPI::init_capture_devices(bool reinit) {
 		ERR_FAIL_COND_V(hr != S_OK, ERR_CANT_OPEN);
 
 		// Set the buffer size
-		microphone_device_output_wasapi->buffer.resize(max_frames * 10); // 10 second test buffer (will crash after it's been filled due to lack of looping)
+		microphone_device_output_wasapi->buffer.resize(max_frames);
 		memset(microphone_device_output_wasapi->buffer.ptrw(), 0x00, microphone_device_output_wasapi->buffer.size() * microphone_device_output_wasapi->frame_size);
 
 		// Get the capture client
@@ -611,6 +608,39 @@ void AudioDriverWASAPI::set_device(String device) {
 	unlock();
 }
 
+// Reads sample i from a raw capture buffer and returns it as a float. PCM is
+// widened to 32 bits and its top 16 bits are scaled by 1/32768 (range [-1, 1));
+// IEEE float is returned unchanged; other format tags log an error and give 0.
+float AudioDriverWASAPI::read_sample(WORD format_tag, int bits_per_sample, BYTE *buffer, int i) {
+	if (format_tag == WAVE_FORMAT_PCM) {
+		// Assemble as unsigned so shifts never act on negative values.
+		uint32_t bits = 0;
+		switch (bits_per_sample) {
+			case 8:
+				bits = uint32_t(buffer[i]) << 24;
+				break;
+			case 16:
+				bits = uint32_t(((uint16_t *)buffer)[i]) << 16;
+				break;
+			case 24:
+				// Packed little-endian; bytes must not be sign-extended or
+				// the lower ones would overwrite the bits above them.
+				bits = (uint32_t(buffer[i * 3 + 2]) << 24) | (uint32_t(buffer[i * 3 + 1]) << 16) | (uint32_t(buffer[i * 3 + 0]) << 8);
+				break;
+			case 32:
+				bits = ((uint32_t *)buffer)[i];
+				break;
+		}
+		return (int32_t(bits) >> 16) / 32768.f;
+	} else if (format_tag == WAVE_FORMAT_IEEE_FLOAT) {
+		return ((float *)buffer)[i];
+	} else {
+		ERR_PRINT("WASAPI: Unknown format tag");
+	}
+
+	return 0.f;
+}
+
 void AudioDriverWASAPI::write_sample(AudioDriverWASAPI *ad, BYTE *buffer, int i, int32_t sample) {
 	if (ad->format_tag == WAVE_FORMAT_PCM) {
 		switch (ad->bits_per_sample) {
@@ -688,19 +718,27 @@ void AudioDriverWASAPI::thread_func(void *p_udata) {
 					memset((char *)(microphone_device_output_wasapi->buffer.ptrw()) + (microphone_device_output_wasapi->current_capture_index * microphone_device_output_wasapi->frame_size), 0, frames_to_copy * microphone_device_output_wasapi->frame_size);
 				} else {
 					// fixme: Only works for floating point atm
-					if (microphone_device_output_wasapi->channels == 2) {
-						for (int j = 0; j < frames_to_copy; j++) {
-							float left = *(((float *)data) + (j * 2));
-							float right = *(((float *)data) + (j * 2) + 1);
-							microphone_device_output_wasapi->buffer[microphone_device_output_wasapi->current_capture_index + j] = AudioFrame(left, right);
+					for (int j = 0; j < frames_to_copy; j++) {
+						float l, r;
+
+						if (microphone_device_output_wasapi->channels == 2) {
+							l = read_sample(microphone_device_output_wasapi->capture_format_tag, microphone_device_output_wasapi->bits_per_sample, data, j * 2);
+							r = read_sample(microphone_device_output_wasapi->capture_format_tag, microphone_device_output_wasapi->bits_per_sample, data, j * 2 + 1);
+						} else if (microphone_device_output_wasapi->channels == 1) {
+							l = r = read_sample(microphone_device_output_wasapi->capture_format_tag, microphone_device_output_wasapi->bits_per_sample, data, j);
+						} else {
+							l = r = 0.f;
+							ERR_PRINT("WASAPI: unsupported channel count in microphone!");
 						}
-					} else if (microphone_device_output_wasapi->channels == 1) {
-						for (int j = 0; j < frames_to_copy; j++) {
-							float value = *(((float *)data) + j);
-							microphone_device_output_wasapi->buffer[microphone_device_output_wasapi->current_capture_index + j] = AudioFrame(value, value);
+
+						microphone_device_output_wasapi->buffer[microphone_device_output_wasapi->current_capture_index++] = AudioFrame(l, r);
+
+						if (microphone_device_output_wasapi->current_capture_index >= microphone_device_output_wasapi->buffer.size()) {
+							microphone_device_output_wasapi->current_capture_index = 0;
+						}
+						if (microphone_device_output_wasapi->current_capture_size < microphone_device_output_wasapi->buffer.size()) {
+							microphone_device_output_wasapi->current_capture_size++;
 						}
-					} else {
-						ERR_PRINT("WASAPI: unsupported channel count in microphone!");
 					}
 				}
 
@@ -709,12 +747,6 @@ void AudioDriverWASAPI::thread_func(void *p_udata) {
 
 				hr = microphone_device_output_wasapi->capture_client->GetNextPacketSize(&packet_length);
 				ERR_BREAK(hr != S_OK);
-
-				microphone_device_output_wasapi->current_capture_index += frames_to_copy;
-
-				// Test: ensuring the read index is always behind the capture index keeps the input and output reliably in sync, but it
-				// also results in clipping, stutter and other audio artefacts
-				microphone_device_output_wasapi->set_read_index(microphone_device_output_wasapi->current_capture_index - 8192);
 			}
 		}
 
diff --git a/drivers/wasapi/audio_driver_wasapi.h b/drivers/wasapi/audio_driver_wasapi.h
index 084d0c2e3fd..e722d85353a 100644
--- a/drivers/wasapi/audio_driver_wasapi.h
+++ b/drivers/wasapi/audio_driver_wasapi.h
@@ -52,6 +52,7 @@ class AudioDriverWASAPI : public AudioDriver {
 	public:
 		IAudioClient *audio_client;
 		IAudioCaptureClient *capture_client;
+		WORD capture_format_tag;
 	};
 	//
 	Mutex *mutex;
@@ -79,6 +80,7 @@ class AudioDriverWASAPI : public AudioDriver {
 	bool active;
 
 	_FORCE_INLINE_ void write_sample(AudioDriverWASAPI *ad, BYTE *buffer, int i, int32_t sample);
+	static _FORCE_INLINE_ float read_sample(WORD format_tag, int bits_per_sample, BYTE *buffer, int i);
 	static void thread_func(void *p_udata);
 
 	StringName get_default_capture_device_name(IMMDeviceEnumerator *p_enumerator);
diff --git a/servers/audio/audio_stream.cpp b/servers/audio/audio_stream.cpp
index 8efcb5bf073..206f1861a35 100644
--- a/servers/audio/audio_stream.cpp
+++ b/servers/audio/audio_stream.cpp
@@ -155,19 +155,22 @@ void AudioStreamPlaybackMicrophone::_mix_internal(AudioFrame *p_buffer, int p_fr
 
 	AudioDriver::MicrophoneDeviceOutput *microphone_device_output = reciever->owner;
 	const Vector<AudioFrame> &source_buffer = microphone_device_output->get_buffer();
+	int current_buffer_size = microphone_device_output->get_current_buffer_size();
 
-	if (microphone_device_output->get_read_index() >= 0) {
-		for (int i = 0; i < p_frames; i++) {
-			p_buffer[i] = source_buffer[internal_mic_offset + microphone_device_output->get_read_index() + i];
+	for (int i = 0; i < p_frames; i++) {
+		if (current_buffer_size >= internal_mic_offset) {
+			if (internal_mic_offset >= source_buffer.size()) {
+				internal_mic_offset = 0;
+			}
+			p_buffer[i] = source_buffer[internal_mic_offset++];
+		} else {
+			p_buffer[i] = AudioFrame(0.f, 0.f);
 		}
 	}
-
-	internal_mic_offset += p_frames;
 }
 
 void AudioStreamPlaybackMicrophone::mix(AudioFrame *p_buffer, float p_rate_scale, int p_frames) {
 	AudioStreamPlaybackResampled::mix(p_buffer, p_rate_scale, p_frames);
-	internal_mic_offset = 0; // Reset
 }
 
 float AudioStreamPlaybackMicrophone::get_stream_sampling_rate() {
@@ -175,6 +178,7 @@ float AudioStreamPlaybackMicrophone::get_stream_sampling_rate() {
 }
 
 void AudioStreamPlaybackMicrophone::start(float p_from_pos) {
+	internal_mic_offset = 0;
 	active = true;
 
 	// note: can this be called twice?
diff --git a/servers/audio/audio_stream.h b/servers/audio/audio_stream.h
index cb3b999cebb..352cb8c5078 100644
--- a/servers/audio/audio_stream.h
+++ b/servers/audio/audio_stream.h
@@ -127,7 +127,7 @@ class AudioStreamPlaybackMicrophone : public AudioStreamPlaybackResampled {
 	friend class AudioStreamMicrophone;
 
 	bool active;
-	uint64_t internal_mic_offset;
+	uint32_t internal_mic_offset;
 
 	Ref<AudioStreamMicrophone> microphone;
 	AudioDriver::MicrophoneReciever *reciever;
diff --git a/servers/audio_server.h b/servers/audio_server.h
index f2c3aa0a6f9..68a56c38f37 100644
--- a/servers/audio_server.h
+++ b/servers/audio_server.h
@@ -90,6 +90,7 @@ public:
 
 		virtual unsigned int get_mix_rate() = 0;
 		virtual Vector<AudioFrame> &get_buffer() = 0;
+		virtual int get_current_buffer_size() = 0;
 		virtual int get_read_index() = 0;
 		virtual void set_read_index(int p_temp_index) = 0;
 
@@ -138,6 +139,7 @@ public:
 		int read_index = -2048;
 
 		unsigned int current_capture_index;
+		unsigned int current_capture_size;
 		Vector<AudioFrame> buffer;
 
 		unsigned int get_mix_rate() {
@@ -148,6 +150,8 @@ public:
 			return buffer;
 		};
 
+		int get_current_buffer_size() { return current_capture_size; } // Returns current_capture_size (unsigned) converted to int.
+
 		int get_read_index() {
 			return read_index;
 		}
@@ -174,6 +178,7 @@ public:
 		void set_read_index(int p_read_index) {
 			owner->set_read_index(p_read_index);
 		}
+		int get_current_buffer_size() { return owner->get_current_buffer_size(); } // Forwards to the owning device output.
 	};
 
 	MicrophoneDeviceOutputIndirect *default_microphone_device_output;