Speech to Text な Bot のサンプルを試してみた

Bot Framework のドキュメントにある、Speech Bot のサンプルをベースに、Speech to Text をちょっといじってみました。
https://docs.botframework.com/en-us/bot-intelligence/speech/#navtitle

成果物

今回作ったサンプルは、音声ファイルの URL を Bot に食わせると、その音声ファイルの内容を分析し Text にして返してくれる Bot です。
デプロイしたのは、以下のやつです。

実行するとこんな感じです。

ついでに、サンプルのコード
まぁ、ドキュメントのソースのほぼコピペなんですけど。。

using System;
using System.Net;
using System.Net.Http;
using System.Threading.Tasks;
using System.Web.Http;
using Microsoft.Bot.Connector;
// add
using System.IO;
using System.Threading;
using System.Web;
using System.Text;
using System.Runtime.Serialization.Json;
using System.Runtime.Serialization;
using Newtonsoft.Json.Linq;

namespace SeeachTestBot
{
    /// <summary>
    /// Web API controller that receives Bot Framework activities. For message
    /// activities the message text is treated as the URL of a WAV file; the audio is
    /// posted to the Bing Speech recognition REST endpoint and the recognized text
    /// is sent back as the reply.
    /// </summary>
    [BotAuthentication]
    public class MessagesController : ApiController
    {
        // Guards lazy creation of the shared Authentication instance.
        private static readonly object AuthGate = new object();

        // One Authentication for the whole process. Every Authentication instance owns a
        // timer that keeps renewing its token, so creating one per incoming message would
        // leave a timer (and a periodic token request) behind for each message.
        private static Authentication sharedAuthentication;

        /// <summary>
        /// POST: api/Messages
        /// Receive a message from a user and reply to it
        /// </summary>
        /// <param name="activity">The activity posted by the Bot Connector.</param>
        /// <returns>
        /// HTTP 200 once the activity has been handled, or HTTP 400 when the request
        /// body could not be bound to an activity.
        /// </returns>
        public async Task<HttpResponseMessage> Post([FromBody]Activity activity)
        {
            if (activity == null)
            {
                return Request.CreateResponse(HttpStatusCode.BadRequest);
            }

            if (activity.Type == ActivityTypes.Message)
            {
                ConnectorClient connector = new ConnectorClient(new Uri(activity.ServiceUrl));

                var text = "";

                // Text can be null (for example when only an attachment is sent), so do
                // not dereference it directly.
                if (!string.IsNullOrWhiteSpace(activity.Text))
                {
                    // NOTE: DoSpeechReco performs blocking network I/O.
                    var reco = DoSpeechReco(activity.Text);

                    text = string.IsNullOrEmpty(reco)
                        ? "Sorry, I could not recognize any speech from that URL."
                        : "You said : " + reco;
                }
                Activity reply = activity.CreateReply(text);
                await connector.Conversations.ReplyToActivityAsync(reply);
            }
            else
            {
                HandleSystemMessage(activity);
            }
            var response = Request.CreateResponse(HttpStatusCode.OK);
            return response;
        }

        /// <summary>
        /// Placeholder for handling non-message activities. No activity type is acted on yet.
        /// </summary>
        /// <param name="message">The non-message activity.</param>
        /// <returns>Always <c>null</c>.</returns>
        private Activity HandleSystemMessage(Activity message)
        {
            if (message.Type == ActivityTypes.DeleteUserData)
            {
                // Implement user deletion here
                // If we handle user deletion, return a real message
            }
            else if (message.Type == ActivityTypes.ConversationUpdate)
            {
                // Handle conversation state changes, like members being added and removed
                // Use Activity.MembersAdded and Activity.MembersRemoved and Activity.Action for info
                // Not available in all channels
            }
            else if (message.Type == ActivityTypes.ContactRelationUpdate)
            {
                // Handle add/remove from contact lists
                // Activity.From + Activity.Action represent what happened
            }
            else if (message.Type == ActivityTypes.Typing)
            {
                // Handle knowing that the user is typing
            }
            else if (message.Type == ActivityTypes.Ping)
            {
            }

            return null;
        }

        /// <summary>
        /// Returns the process-wide <see cref="Authentication"/> instance, creating it on first use.
        /// </summary>
        private static Authentication GetAuthentication()
        {
            lock (AuthGate)
            {
                if (sharedAuthentication == null)
                {
                    // Note: Sign up at https://microsoft.com/cognitive to get a subscription key.
                    // Use the subscription key as Client secret below.
                    // Prefer loading the real key from configuration instead of committing it.
                    sharedAuthentication = new Authentication("YOURUSERID", "<<BIng Speech API のキー>>");
                }
                return sharedAuthentication;
            }
        }

        /// <summary>
        /// Downloads the audio at <paramref name="wevUrl"/> and posts it to the Bing Speech
        /// recognition endpoint.
        /// </summary>
        /// <param name="wevUrl">Absolute http/https URL of the WAV audio to recognize.</param>
        /// <returns>
        /// The <c>header.name</c> value of the JSON response, or <c>null</c> when the URL is
        /// not an absolute http/https URL, no access token is available, a request fails,
        /// or the response does not contain that value.
        /// </returns>
        private string DoSpeechReco(String wevUrl)
        {
            // The URL comes straight from the user's message. Accept only absolute
            // http/https URLs so that malformed input does not throw and other schemes
            // (such as file://) are never fetched by the server.
            Uri wavUri;
            if (wevUrl == null
                || !Uri.TryCreate(wevUrl.Trim(), UriKind.Absolute, out wavUri)
                || (wavUri.Scheme != Uri.UriSchemeHttp && wavUri.Scheme != Uri.UriSchemeHttps))
            {
                return null;
            }

            string requestUri = "https://speech.platform.bing.com/recognize";

            //URI Params. Refer to the Speech API documentation for more information.
            requestUri += @"?scenarios=smd";                                // websearch is the other main option.
            requestUri += @"&appid=D4D52672-91D7-4C74-8AD8-42B1D98141A5";   // You must use this ID.
            requestUri += @"&locale=en-US";                                 // read docs, for other supported languages.
//          requestUri += @"&locale=ja-JP";
            requestUri += @"&device.os=wp7";
            requestUri += @"&version=3.0";
            requestUri += @"&format=json";
            requestUri += @"&instanceid=565D69FF-E928-4B7E-87DA-9A750B96D9E3";
            requestUri += @"&requestid=" + Guid.NewGuid().ToString();

            string host = @"speech.platform.bing.com";
            string contentType = @"audio/wav; codec=""audio/pcm""; samplerate=16000";

            try
            {
                AccessTokenInfo token = GetAuthentication().GetAccessToken();
                if (token == null || string.IsNullOrEmpty(token.access_token))
                {
                    Console.WriteLine("No access token available.");
                    return null;
                }

                //Create a header with the access_token property of the returned token
                // (the token itself is a credential and is deliberately not logged).
                string headerValue = "Bearer " + token.access_token;
                Console.WriteLine("Request Uri: " + requestUri + Environment.NewLine);

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(requestUri);
                request.SendChunked = true;
                request.Accept = @"application/json;text/xml";
                request.Method = "POST";
                request.ProtocolVersion = HttpVersion.Version11;
                request.Host = host;
                request.ContentType = contentType;
                request.Headers["Authorization"] = headerValue;

                // Stream the audio from the source URL into the recognition request body.
                WebRequest wavRequest = WebRequest.Create(wavUri);
                using (WebResponse wavResponse = wavRequest.GetResponse())
                using (Stream wavStream = wavResponse.GetResponseStream())
                using (Stream requestStream = request.GetRequestStream())
                {
                    wavStream.CopyTo(requestStream);
                    requestStream.Flush();
                }

                //Get the response from the service.
                string responseString;
                Console.WriteLine("Response:");
                using (WebResponse response = request.GetResponse())
                {
                    Console.WriteLine(((HttpWebResponse)response).StatusCode);
                    using (StreamReader sr = new StreamReader(response.GetResponseStream()))
                    {
                        responseString = sr.ReadToEnd();
                    }
                }
                Console.WriteLine(responseString);

                // SelectToken yields null instead of throwing when "header" or "name" is missing.
                JObject json = JObject.Parse(responseString);
                return (string)json.SelectToken("header.name");
            }
            catch (Exception ex)
            {
                // Best effort: any download, recognition or parsing failure is logged and
                // reported to the caller as "nothing recognized".
                Console.WriteLine(ex.ToString());
                return null;
            }
        }
    }

    // add
    /// <summary>
    /// Data contract for the JSON document returned by the token endpoint.
    /// Property names mirror the JSON member names.
    /// </summary>
    [DataContract]
    public class AccessTokenInfo
    {
        /// <summary>Value of the <c>access_token</c> JSON member.</summary>
        [DataMember(Name = "access_token")] public string access_token { get; set; }

        /// <summary>Value of the <c>token_type</c> JSON member.</summary>
        [DataMember(Name = "token_type")] public string token_type { get; set; }

        /// <summary>Value of the <c>expires_in</c> JSON member, kept as a string.</summary>
        [DataMember(Name = "expires_in")] public string expires_in { get; set; }

        /// <summary>Value of the <c>scope</c> JSON member.</summary>
        [DataMember(Name = "scope")] public string scope { get; set; }
    }

    /// <summary>
    /// Obtains an access token from <see cref="AccessUri"/> using client credentials and
    /// keeps it fresh by re-requesting it on a timer. The constructor performs the first
    /// token request synchronously. Dispose the instance to stop the renewal timer.
    /// </summary>
    public class Authentication : IDisposable
    {
        public static readonly string AccessUri = "https://oxford-speech.cloudapp.net/token/issueToken";
        private string clientId;
        private string clientSecret;
        private string request;

        // Written by the timer callback and read by callers on other threads.
        private volatile AccessTokenInfo token;
        private Timer accessTokenRenewer;

        // Set by Dispose so an in-flight callback does not try to reschedule a disposed timer.
        private volatile bool disposed;

        //Access token expires every 10 minutes. Renew it every 9 minutes only.
        private const int RefreshTokenDuration = 9;

        /// <summary>
        /// Requests the initial token and schedules the first renewal.
        /// </summary>
        /// <param name="clientId">Client id sent in the token request.</param>
        /// <param name="clientSecret">Client secret (the subscription key) sent in the token request.</param>
        public Authentication(string clientId, string clientSecret)
        {
            this.clientId = clientId;
            this.clientSecret = clientSecret;

            //If clientid or client secret has special characters, encode before sending request
            // Form fields are separated by a plain '&'; an HTML-escaped "&amp;" would corrupt the body.
            this.request = string.Format("grant_type=client_credentials&client_id={0}&client_secret={1}&scope={2}",
                                              HttpUtility.UrlEncode(clientId),
                                              HttpUtility.UrlEncode(clientSecret),
                                              HttpUtility.UrlEncode("https://speech.platform.bing.com"));

            this.token = HttpPost(AccessUri, this.request);

            // renew the token every specified minutes
            // The period of -1 ms makes this a one-shot timer; the callback re-arms it.
            accessTokenRenewer = new Timer(new TimerCallback(OnTokenExpiredCallback),
                                           this,
                                           TimeSpan.FromMinutes(RefreshTokenDuration),
                                           TimeSpan.FromMilliseconds(-1));
        }

        /// <summary>Returns the most recently obtained access token.</summary>
        public AccessTokenInfo GetAccessToken()
        {
            return this.token;
        }

        /// <summary>Stops the renewal timer. The last token remains readable.</summary>
        public void Dispose()
        {
            disposed = true;
            Timer timer = accessTokenRenewer;
            if (timer != null)
            {
                timer.Dispose();
            }
        }

        //Renew the access token
        private void RenewAccessToken()
        {
            AccessTokenInfo newAccessToken = HttpPost(AccessUri, this.request);
            //swap the new token with old one (a single reference assignment)
            this.token = newAccessToken;
            // The token value is a credential, so only the fact of renewal is logged.
            Console.WriteLine(string.Format("Renewed token for user: {0}", this.clientId));
        }

        //Call-back when we determine the access token has expired
        private void OnTokenExpiredCallback(object stateInfo)
        {
            if (disposed)
            {
                return;
            }

            try
            {
                RenewAccessToken();
            }
            catch (Exception ex)
            {
                Console.WriteLine(string.Format("Failed renewing access token. Details: {0}", ex.Message));
            }
            finally
            {
                if (!disposed)
                {
                    try
                    {
                        // Re-arm the one-shot timer whether or not the renewal succeeded.
                        accessTokenRenewer.Change(TimeSpan.FromMinutes(RefreshTokenDuration), TimeSpan.FromMilliseconds(-1));
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine(string.Format("Failed to reschedule timer to renew access token. Details: {0}", ex.Message));
                    }
                }
            }
        }

        //Helper function to get new access token
        private AccessTokenInfo HttpPost(string accessUri, string requestDetails)
        {
            //Prepare OAuth request
            WebRequest webRequest = WebRequest.Create(accessUri);
            webRequest.ContentType = "application/x-www-form-urlencoded";
            webRequest.Method = "POST";
            byte[] bytes = Encoding.ASCII.GetBytes(requestDetails);
            webRequest.ContentLength = bytes.Length;
            using (Stream outputStream = webRequest.GetRequestStream())
            {
                outputStream.Write(bytes, 0, bytes.Length);
            }
            using (WebResponse webResponse = webRequest.GetResponse())
            using (Stream responseStream = webResponse.GetResponseStream())
            {
                DataContractJsonSerializer serializer = new DataContractJsonSerializer(typeof(AccessTokenInfo));
                //Get deserialized object from JSON stream
                return (AccessTokenInfo)serializer.ReadObject(responseStream);
            }
        }
    }
}

補足

Microsoft Azure の Cognitive Services では、Speech API を使います。

現時点ではこのAPIにもFreeのプランがあります。(スクリーンショットにマウスアイコンかぶってるけど。。。)

作成した Speech API のキーを、ソースの89行目あたり(`new Authentication(...)` の第2引数)に設定しています。

あとは、Azure App Services の API Appsにデプロイして、
Bot Frameworkに登録してぐにゅぐにゅするくらいです。

元のサンプル

今回自分が作ったのは、元のサンプルの機能を削った形となっています。
元のサンプルは、Microsoft Bot Framework Channel Emulator での動作確認しかしてませんが、音声の入った WAV ファイルを添付して、メッセージに「SPACE」と打つと、解析結果のスペースの数も教えてくれる Bot となってます。

音声を分析して、そのままテキスト化したメッセージを返すのではなく、その結果をさらに分析したり、そのワードで検索かけたりとか、いろいろできそうですね。

Speech to Text のおまけ

Speech to Text を使う場合、今回のサンプルは REST でたたくパターンでしたが、SDK を利用することもできます。

GitHubにあったサンプル
https://github.com/microsoft/cognitive-speech-stt-windows

動かして、マイクでしゃべるとこんな感じ。

デフォルトだと、英語なんですけど、
MainWindow.xaml.cs の241行目あたりの「DefaultLocale」で、「ja-JP」を指定してあげれば簡単に日本語化できます。

using に、以下が指定してあります。

using Microsoft.ProjectOxford.SpeechRecognition;

あとは、StartButton_Click() あたりから、見ていくといいと思います。

まとめ

Speech to Text をやっていて、ロケールを事前に知らなくても、音声を分析して自動で判別できたりしないかなーって思ったのですが、SDK にしろ REST にしろ、ロケールを明示的に設定しているので、どうなんだろ。

その辺が、まだよくわからないかも。

SDK の勉強がてら、UWP で音を聞いて分析するのにチャレンジしてみようかなー。