Scrolling webpages with Web Speech API

Would you like to try utilizing the Web Speech API for something? What about utilizing voice commands to navigate a website?

Here is a quick introduction to the Web Speech API before we get started.

What is the Web Speech API?

To start off, the Web Speech API allows you to add voice capabilities to your web app. It has two components: SpeechSynthesis and SpeechRecognition.

SpeechSynthesis is used to convert text into speech, and SpeechRecognition is used to convert speech into text.

We’ll be concentrating just on the SpeechRecognition component of the Web Speech API for the sake of this demonstration.

In simplest terms, phonemes—the sounds we make when we speak—are matched with written words in speech recognition.

Let's begin

In the following sections, we will start to code a simple HTML page with long scrollable text and the JavaScript code needed to make this page scroll via speech commands.

HTML

Create a simple page with lots of text to scroll through.

				
					<!DOCTYPE html>
<html lang="en">
    <head>
        <meta charset="utf-8">
        <title>Page Scroller using Web Speech API</title>
        <meta name="description" content="Page Scroller using Web Speech API">
        <meta name="author" content="Cloudoki">

        <!-- Mobile Specific Metas
        –––––––––––––––––––––––––––––––––––––––––––––––––– -->
        <meta name="viewport" content="width=device-width, initial-scale=1">

        <!-- CSS
        –––––––––––––––––––––––––––––––––––––––––––––––––– -->
        <link rel="stylesheet" href="styles/custom.css">

        <!-- Favicon
        –––––––––––––––––––––––––––––––––––––––––––––––––– -->
        <link rel="icon" type="image/png" href="images/icon.png">
    </head>
    <body>
        <div class="container">
            <h1>Page Scroller using Web Speech API</h1>

            <!-- Usage instructions shown to the visitor -->
            <h6>click the scroller button then say:</h6>
            <h6>"scroll + [up, down, top, bottom]"</h6>
            <h6>click again to stop</h6>

            <!-- Long filler text so that the page has something to scroll through -->
            <div class="lipsum">
                [LARGE TEXT HERE]
            </div>

            <!-- Button that toggles listening; the script selects it with the ".scroller" selector -->
            <button class="scroller">SCROLLER</button>
        </div>

        <!-- Voice-scrolling logic; "defer" runs it after the document has been parsed -->
        <script src="scripts/index.js" defer></script>
    </body>
</html>

JAVASCRIPT

In our script, let’s first see if the browser supports the Web Speech API.

				
					// Resolve the speech recognition constructor: the standard name first, then the
// webkit-prefixed one, and null when the browser offers neither.
// The names are read as properties of `window` rather than as bare identifiers:
// a missing property is simply undefined, so no try/catch is needed and no
// error is logged on unsupported browsers. It also keeps the lookup working
// when this code is not at global scope, where the hoisted `var SpeechRecognition`
// would otherwise hide the browser's own `SpeechRecognition`.
// The `typeof window` guard lets the script load where there is no `window`.
// Keep this a single expression: at global scope the `var` and
// `window.SpeechRecognition` are the same binding, so it must be read before it is assigned.
var SpeechRecognition = (typeof window !== 'undefined' &&
  (window.SpeechRecognition || window.webkitSpeechRecognition)) || null;

/**
 * Entry point: tell the user whether the Web Speech API can be used.
 * Shows an alert when SpeechRecognition is null, otherwise logs a confirmation.
 */
function init () {
  var isSupported = SpeechRecognition !== null;

  if (!isSupported) {
    alert('Web Speech API is not supported.');
    return;
  }

  console.log('Web Speech API is supported.');
}

// Run the support check once the window's load event fires.
window.addEventListener('load', function onPageLoaded() {
  init();
}, false);

If the SpeechRecognition variable is null, then the browser does not support the Web Speech API. Otherwise, we can use it to transcribe our speech.

*Note:

At the time of writing this article, the browsers that supported the Web Speech API were limited. We recommend using Google Chrome for this.

Starting the speech recognition

To start the recognizer, we first create a new SpeechRecognition object instance with var recognizer = new SpeechRecognition(); and then set its properties.

SpeechRecognition.continuous

This controls whether continuous results are returned for each recognition, or only a single result. It defaults to a single result (false).

SpeechRecognition.interimResults

Controls whether interim results should be returned (true) or not (false). This means we get results that are not the final result.

SpeechRecognition.lang

This returns and sets the language of the current SpeechRecognition. If not set, it uses the HTML lang attribute value or the user agent language. It’s a good practice to set this value.

SpeechRecognition.maxAlternatives

This sets the maximum number of alternatives provided per result.

				
					 // `continuous` defaults to false, so it must be assigned unconditionally:
  // an assignment guarded by `if (recognizer.continuous)` would never run.
  recognizer.continuous = true; // keep listening after each result
  recognizer.interimResults = true; // we want partial result
  recognizer.lang = 'en-US'; // set language
  recognizer.maxAlternatives = 2; // number of alternatives for the recognized speech

When we click the scroller button, the recognizer should start by calling recognizer.start(), and clicking the button again should stop it by calling recognizer.stop(). We’ll also use a state variable to keep track of whether the recognizer is listening and has really started.

				
					var scrollerClass = '.scroller'; // selector of the button that starts and stops the recognizer

// Resolve the speech recognition constructor: the standard name first, then the
// webkit-prefixed one, and null when the browser offers neither.
// The names are read as properties of `window` rather than as bare identifiers:
// a missing property is simply undefined, so no try/catch is needed and no
// error is logged on unsupported browsers. It also keeps the lookup working
// when this code is not at global scope, where the hoisted `var SpeechRecognition`
// would otherwise hide the browser's own `SpeechRecognition`.
// The `typeof window` guard lets the script load where there is no `window`.
// Keep this a single expression: at global scope the `var` and
// `window.SpeechRecognition` are the same binding, so it must be read before it is assigned.
var SpeechRecognition = (typeof window !== 'undefined' &&
  (window.SpeechRecognition || window.webkitSpeechRecognition)) || null;

/**
* Initialize the Speech Recognition functions.
*
* Creates and configures a recognizer, then wires the scroller button so that
* one click starts listening and the next click stops it. If the recognizer
* has not reported a start 3 seconds after a click, listening is cancelled
* and the user is told to check the microphone permission or the browser.
*/
function startSpeechRecognier () {
  // state used to start and stop the detection
  var state = {
    'listening': false, // the user asked the recognizer to listen
    'started': false,   // the recognizer confirmed it through onstart
  };
  var startTimer = null; // pending "did it really start?" check, if any
  var scroller = document.querySelector(scrollerClass); // button to start and stop the recognizer
  var recognizer = new SpeechRecognition();

  // set recognizer to be continuous; the property defaults to false, so it is
  // assigned unconditionally (a guard on its current value would never pass)
  recognizer.continuous = true;
  recognizer.interimResults = true; // we want partial result
  recognizer.lang = 'en-US'; // set language
  recognizer.maxAlternatives = 2; // number of alternatives for the recognized speech

  // cancel the pending start check so that it cannot fire against a later session
  function clearStartTimer () {
    if (startTimer !== null) {
      clearTimeout(startTimer);
      startTimer = null;
    }
  }

  recognizer.onstart = function () {
    // listening started
    state.started = true;
    clearStartTimer();
    scroller.innerHTML = 'listening';
    console.log('onstart');
  };

  scroller.onclick = function () {
    if (state.listening === false) {
      state.listening = true;
      try {
        // start recognizer
        recognizer.start();
      } catch(ex) {
        // nothing is listening: reset the flag so the next click tries to start
        // again instead of taking the "stop" branch
        state.listening = false;
        console.log('Recognition error: ' + ex.message);
        alert('Failed to start recognizer.');
        return;
      }
      console.log('start clicked');
      // if after 3 seconds it doesn't start stop and show message to user
      clearStartTimer();
      startTimer = setTimeout(function () {
        startTimer = null;
        if(!state.started && state.listening) {
          scroller.click();
          alert('Web Speech API seems to not be working. Check if you gave permission to access the microphone or try with another browser.');
        }
      }, 3000);
    } else {
      state.listening = false;
      state.started = false;
      clearStartTimer();
      // stop recognizer
      recognizer.stop();
      scroller.innerHTML = 'scroller';
      console.log('stop clicked');
    }
  };

}

/**
 * Entry point: start the speech recognizer when the Web Speech API is available.
 * Shows an alert instead when SpeechRecognition is null.
 */
function init () {
  var isSupported = SpeechRecognition !== null;

  if (!isSupported) {
    alert('Web Speech API is not supported.');
    return;
  }

  // supported: wire up the recognizer, then report that setup is done
  startSpeechRecognier();
  console.log('initialized...');
}

// Set everything up once the window's load event fires.
window.addEventListener('load', function onPageLoaded() {
  init();
}, false);

Getting and handling the results

After starting the recognizer, several events will occur that we can use to get the results or other information (like recognizer.onstart in the code above, which is triggered when the service starts to listen to voice input). To get the results we are going to use the SpeechRecognition.onresult event handler, which is fired every time we get a successful result.

So inside the startSpeechRecognier function, we’ll add:

				
					recognizer.onresult = function (event) {
    // the event carries the list of recognition results
    var resultList = event.results;
    if (typeof resultList === 'undefined') {
        // no results on the event: stop the recognizer and bail out
        recognizer.stop();
        return;
    }

    // walk the results starting at the index given by the event
    for (var idx = event.resultIndex; idx < resultList.length; idx += 1) {
      var result = resultList[idx];

      if (!result.isFinal) {
        // partial result: log its first alternative and the number of alternatives
        console.log('Partial:', result[0].transcript, result.length);
        continue;
      }

      // final result: collect every alternative with its confidence
      // (formatted to 4 decimals) and its transcribed text
      var results = [];
      for (var alt = 0; alt < result.length; alt += 1) {
        var alternative = result[alt];
        results.push({
          'confidence': alternative.confidence.toFixed(4),
          'text': alternative.transcript
        });
      }

      console.log('Final results:', results);
    }
  };

The event parameter holds the list of results returned by the service (a SpeechRecognitionResultList) in event.results. Each entry of that list has an isFinal property, which tells us whether it is a final result or a partial one. Each entry is itself a list of alternatives, and every alternative is an object with the properties confidence and transcript.

The confidence values (between 0 and 1) allow us to know how much the service is sure that the speech matches the transcription we received. The transcript is the text that the service generates based on what it understands from our voice input.

We’ll also use the events onend, onspeechend and onerror. Respectively, they tell us when the service has stopped (so we can start listening again), when the service has detected that our speech stopped (so we know it is not listening to new sentences), and when an error has occurred.

				
					// the session ended: while the user still wants to listen, start a new one
  recognizer.onend = function () {
    console.log('onend');
    if (!state.listening) {
      return;
    }
    recognizer.start();
  };

  // speech is no longer being detected: show a waiting label on the button
  recognizer.onspeechend = function () {
    console.log('Speech has stopped being detected');
    scroller.innerHTML = 'wait';
  };

  // something went wrong: log whatever the recognizer passed to the handler
  recognizer.onerror = function (errorEvent) {
    console.log('onerror:', errorEvent);
  };

Scrolling based on the transcript

In order to scroll, we’ll need to add the following code to the onresult handler, inside the isFinal branch, once the results array has been filled:

				
					// take the transcript that ranks first by confidence and run it as a command
var rankedTranscripts = sortByConfidence(results);
var topTranscript = rankedTranscripts[0];
console.log('Final results:', results, topTranscript);
autoScroll(topTranscript);

This will sort the results by confidence values, get the best one, and send it to be executed if it’s a command matching our trigger words. Adding the next bit of code will achieve the behavior explained.

				
					/**
* Returns the transcripts ordered by confidence value, in descending order.
* Entries with the same confidence keep their original relative order, and
* the list that is passed in is not modified.
* @param {array} list - A list of objects containing the confidence and text values.
* @return {array} The text values, ordered from the highest to the lowest confidence
*/
function sortByConfidence(list) {
  // Sort a copy so the caller's list is not reordered. The comparator sorts
  // descending directly; sorting ascending and then reversing would also
  // reverse the order of entries that have the same confidence.
  var ordered = list.slice().sort(function(a, b) {
    return b.confidence - a.confidence;
  });
  return ordered.map(function(obj) {
    return obj.text;
  });
}

/**
 * Scroll the page if the given speech contains one of the commands.
 *
 * The speech is matched by substring, ignoring case, in this order:
 * 'up' and 'down' move the page by 250 pixels, 'top' and 'bottom' jump to the
 * start or the end of the page. Anything else, including a value that is not
 * a string, is ignored.
 *
 * @param {String} speech The command to evaluate
 */
function autoScroll (speech) {
  if (typeof speech !== 'string' && !(speech instanceof String)) {
    return;
  }

  // transcripts can come back capitalized ("Scroll up"), so compare in lower case
  var command = speech.toLowerCase();

  var body = document.body;
  var html = document.documentElement;
  // take the largest of the values the page reports for its full height
  var pageHeight = Math.max(body.scrollHeight, body.offsetHeight,
    html.clientHeight, html.scrollHeight, html.offsetHeight);
  // take the largest of the values the page reports for its current scroll position
  var currentHeight = Math.max(body.scrollTop, html.scrollTop, window.pageYOffset);

  if (command.indexOf('up') > -1) {
    console.log('Scrolling up...');
    window.scrollTo({
      top: currentHeight - 250,
      behavior: 'smooth'
    });
  } else if (command.indexOf('down') > -1) {
    console.log('Scrolling down...');
    window.scrollTo({
      top: currentHeight + 250,
      behavior: 'smooth'
    });
  } else if (command.indexOf('top') > -1) {
    console.log('Scrolling top...');
    window.scrollTo({
      top: 0,
      behavior: 'smooth'
    });
  } else if (command.indexOf('bottom') > -1) {
    console.log('Scrolling bottom...');
    window.scrollTo({
      top: pageHeight,
      behavior: 'smooth'
    });
  }
}

Now that this is done, you should be able to navigate up, down, to the top, or to the bottom of your website. Just be mindful that the transcriptions may not always be an exact match to what is being said, but they can still improve accessibility. As you can see, you can easily modify this to carry out other commands and build your own straightforward web assistant.

You can check the full demo code here.