Create a Language Translation Mobile App using React Native and Google APIs

In this blog, we are going to learn how to create a simple React Native based Language Translation Android app with Speech to Text and Text to Speech capabilities powered by Google APIs.

Installing dependencies:

Go to React Native Docs, select React Native CLI Quickstart and select your Development OS and Target OS -> Android, as we are going to build an Android application.

Follow the docs for installing dependencies and create a new React Native Application. Use the command line interface to generate a new React Native project called "Translator":

react-native init Translator

You should see a folder named Translator created. Now open Translator folder with your favourite code editor and create a file called Translator.js. We need an input box for text that needs to be translated and another output section to display the translated text. We also need a select box that lists different languages to choose from for translation. Let’s create a json file, call it languages.json.

Go to languages.json file and copy the code below:

{
   "auto": "Auto Detect",
   "af": "Afrikaans",
   "sq": "Albanian",
   "am": "Amharic",
   "ar": "Arabic",
   "hy": "Armenian",
   "az": "Azerbaijani",
   "eu": "Basque",
   "be": "Belarusian",
   "bn": "Bengali",
   "bs": "Bosnian",
   "bg": "Bulgarian",
   "ca": "Catalan",
   "ceb": "Cebuano",
   "ny": "Chichewa",
   "zh-cn": "Chinese Simplified",
   "zh-tw": "Chinese Traditional",
   "co": "Corsican",
   "hr": "Croatian",
   "cs": "Czech",
   "da": "Danish",
   "nl": "Dutch",
   "en": "English",
   "eo": "Esperanto",
   "et": "Estonian",
   "tl": "Filipino",
   "fi": "Finnish",
   "fr": "French",
   "fy": "Frisian",
   "gl": "Galician",
   "ka": "Georgian",
   "de": "German",
   "el": "Greek",
   "gu": "Gujarati",
   "ht": "Haitian Creole",
   "ha": "Hausa",
   "haw": "Hawaiian",
   "iw": "Hebrew",
   "hi": "Hindi",
   "hmn": "Hmong",
   "hu": "Hungarian",
   "is": "Icelandic",
   "ig": "Igbo",
   "id": "Indonesian",
   "ga": "Irish",
   "it": "Italian",
   "ja": "Japanese",
   "jw": "Javanese",
   "kn": "Kannada",
   "kk": "Kazakh",
   "km": "Khmer",
   "ko": "Korean",
   "ku": "Kurdish (Kurmanji)",
   "ky": "Kyrgyz",
   "lo": "Lao",
   "la": "Latin",
   "lv": "Latvian",
   "lt": "Lithuanian",
   "lb": "Luxembourgish",
   "mk": "Macedonian",
   "mg": "Malagasy",
   "ms": "Malay",
   "ml": "Malayalam",
   "mt": "Maltese",
   "mi": "Maori",
   "mr": "Marathi",
   "mn": "Mongolian",
   "my": "Myanmar (Burmese)",
   "ne": "Nepali",
   "no": "Norwegian",
   "ps": "Pashto",
   "fa": "Persian",
   "pl": "Polish",
   "pt": "Portuguese",
   "ma": "Punjabi",
   "ro": "Romanian",
   "ru": "Russian",
   "sm": "Samoan",
   "gd": "Scots Gaelic",
   "sr": "Serbian",
   "st": "Sesotho",
   "sn": "Shona",
   "sd": "Sindhi",
   "si": "Sinhala",
   "sk": "Slovak",
   "sl": "Slovenian",
   "so": "Somali",
   "es": "Spanish",
   "su": "Sundanese",
   "sw": "Swahili",
   "sv": "Swedish",
   "tg": "Tajik",
   "ta": "Tamil",
   "te": "Telugu",
   "th": "Thai",
   "tr": "Turkish",
   "uk": "Ukrainian",
   "ur": "Urdu",
   "uz": "Uzbek",
   "vi": "Vietnamese",
   "cy": "Welsh",
   "xh": "Xhosa",
   "yi": "Yiddish",
   "yo": "Yoruba",
   "zu": "Zulu"
}

Translator.js (modify file), copy the code below:

import React, { Component } from 'react';
import { View, TextInput, StyleSheet, TouchableOpacity, TouchableHighlight, Text, Picker, Image } from 'react-native';
import Languages from './languages.json';

export default class Translator extends Component {

   constructor(props) {
       super(props);
       this.state = {
           languageFrom: "",
           languageTo: "",
           languageCode: 'en',
           inputText: "",
           outputText: "",
           submit: false,
       };
   }

   render() {
       return (
           <View style = {styles.container}>
               <View style={styles.input}>
                   <TextInput
                       style={{flex:1, height: 80}}
                       placeholder="Enter Text"
                       underlineColorAndroid="transparent"
                       onChangeText = {inputText => this.setState({inputText})}
                       value={this.state.inputText}
                   />
               </View>

               <Picker
               selectedValue={this.state.languageTo}
               onValueChange={ lang => this.setState({languageTo: lang, languageCode: lang})}
               >
                    {Object.keys(Languages).map(key => (
                        <Picker.Item key={key} label={Languages[key]} value={key} />
                    ))}
               </Picker>

               <View style = {styles.output}>
                  {/* output text displays here.. */}
               </View>
               <TouchableOpacity
                   style = {styles.submitButton}
                   onPress = {this.handleTranslate}
               >
                   <Text style = {styles.submitButtonText}> Submit </Text>
               </TouchableOpacity>
           </View>
       )
   }
}

const styles = StyleSheet.create({
   container: {
       paddingTop: 53
   },
   input: {
       flexDirection: 'row',
       justifyContent: 'center',
       alignItems: 'center',
       backgroundColor: '#fff',
       borderWidth: .5,
       borderColor: '#000',
       // height: 40,
       borderRadius: 5 ,
       margin: 10
   },
   output: {
       flexDirection: 'row',
       justifyContent: 'center',
       alignItems: 'center',
       backgroundColor: '#fff',
       borderWidth: .5,
       borderColor: '#000',
       borderRadius: 5 ,
       margin: 10,
       height: 80,
   },
   submitButton: {
       backgroundColor: '#7a42f4',
       padding: 10,
       margin: 15,
       borderRadius: 5 ,
       height: 40,
   },
   submitButtonText:{
       color: 'white'
   },
})

Now import Translator.js into your App.js file.
Replace your App.js file with the code below:

import React, {Component} from 'react';
import {View} from 'react-native';
import Translator from './Translator';

export default class App extends Component {
   render() {
       return (
       <View>
           <Translator />
       </View>
       );
   }
}

Preparing the Android device

You will need an Android device to run your React Native Android app. This can be either a physical Android device, or more commonly, you can use an Android Virtual Device (AVD) which allows you to emulate an Android device on your computer (using Android Studio).

Either way, you will need to prepare the device to run Android apps for development.

Using a physical device

If you have a physical Android device, you can use it for development in place of an AVD by connecting it to your computer using a USB cable and following the instructions here.

If you are using a virtual device, follow this link.

Now go to command line and run react-native run-android inside your React Native app directory:

cd Translator
react-native run-android

If everything is set up correctly, you should see your new app running on your physical device or Android emulator shortly, as shown below.

That’s great. We got the basic UI for our Translator app. Now we need to translate the input text into the selected language on submit. In React Native we have a library called react-native-power-translator for translating the text.

Let’s install the react-native-power-translator library. Go to the project root directory in command line and run the below command:

npm i react-native-power-translator --save

Usage:

import { PowerTranslator, ProviderTypes, TranslatorConfiguration, TranslatorFactory } from 'react-native-power-translator';

//Example
TranslatorConfiguration.setConfig('Provider_Type', 'Your_API_Key','Target_Language', 'Source_Language');

//Fill with your own details
TranslatorConfiguration.setConfig(ProviderTypes.Google, 'xxxx','fr');
  • PowerTranslator: a simple component to translate your texts.
  • ProviderTypes: the type of cloud provider you want to use. There are two providers you can specify: ProviderTypes.Google for Google Translate and ProviderTypes.Microsoft for the Microsoft Translator Text cloud service.
  • TranslatorFactory: returns a suitable translator instance, based on your configuration (see the sketch after this list).
  • TranslatorConfiguration: a singleton class that keeps the translator configuration.
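
For a quick idea of how TranslatorFactory can be used imperatively, here is a minimal sketch based on the handleTranslate logic shown later in this post (the API key placeholder, target language and sample text are assumptions you would replace with your own values):

import { TranslatorConfiguration, TranslatorFactory, ProviderTypes } from 'react-native-power-translator';

// Assumed placeholders: use your own API key and target language code.
TranslatorConfiguration.setConfig(ProviderTypes.Google, 'YOUR_API_KEY', 'fr');

const translator = TranslatorFactory.createTranslator();
translator.translate('Hello world').then(translated => {
    // "translated" holds the text returned by the configured provider.
    console.log(translated);
});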

Now add the following code in your Translator.js file. In the code below I am using the Google provider; you can use either the Google or the Microsoft provider.

Save all the files and run your app from the command line again, and you should see a working app that translates text from one language to another, as shown below.

import React, { Component } from 'react';
...
...
import { PowerTranslator, ProviderTypes, TranslatorConfiguration, TranslatorFactory } from 'react-native-power-translator';

export default class Translator extends Component {
...
...
render() {
        TranslatorConfiguration.setConfig(ProviderTypes.Google, 'XXXX', this.state.languageCode);
       return (
             ...
             ...
             ...
             <View style = {styles.output}>
                  {/* output text displays here.. */}
              {this.state.submit && <PowerTranslator  text={this.state.inputText} />}
              </View>

             ...
...
    
}
}

In the image below you can see text that was translated from English to French.

On Android devices you can download keyboards for different languages, so you can translate from your local language into other languages.

For speech-to-text we have a library called react-native-android-voice. Let's install this library in our project.
Go to the command line, navigate to the project root directory, and run the command below:

npm install --save react-native-android-voice

After installing it successfully, please follow the steps in this link to link the library to your Android project.

Once you have finished linking the library to your Android project, let's start implementing it in our Translator.js file.

Let's add a mic icon in our input box; when the user taps the mic icon, the speech feature will be enabled. For the icon there is a library called react-native-vector-icons. For installation, follow the steps in this link.

In this project I'm using Ionicons icons; you can change the bundled fonts via iconFontNames in your android/app/build.gradle file as:

project.ext.vectoricons = [
   iconFontNames: [ 'Ionicons.ttf' ] // Name of the font files you want to copy
]

Now add the following code in Translator.js file.

import React, { Component } from 'react';
...
...
import Icon from "react-native-vector-icons/Ionicons";
import SpeechAndroid from 'react-native-android-voice';

export default class Translator extends Component {
constructor(props) {
      super(props);
      this.state = {
          languageFrom: "",
          ....
          ....
          micOn: false, //Add this
      };
      this._buttonClick = this._buttonClick.bind(this); //Add this
  }

...
async _buttonClick(){
       await this.setState({micOn: true})
       try{
           var spokenText = await SpeechAndroid.startSpeech("", SpeechAndroid.ENGLISH);
           await this.setState({inputText: spokenText});
           await ToastAndroid.show(spokenText , ToastAndroid.LONG);
       }catch(error){
           switch(error){
               case SpeechAndroid.E_VOICE_CANCELLED:
                   ToastAndroid.show("Voice Recognizer cancelled" , ToastAndroid.LONG);
                   break;
               case SpeechAndroid.E_NO_MATCH:
                   ToastAndroid.show("No match for what you said" , ToastAndroid.LONG);
                   break;
               case SpeechAndroid.E_SERVER_ERROR:
                   ToastAndroid.show("Google Server Error" , ToastAndroid.LONG);
                   break;
           }
       }
       this.setState({micOn: false})
   }

render() {
       TranslatorConfiguration.setConfig(ProviderTypes.Google,'XXXX', this.state.languageCode);
       return (
             <View style = {styles.container}>
              <View style={styles.input}>
                  <TextInput
                      ...
                      ...
                      ...
                  />
                  <TouchableOpacity onPress={this._buttonClick}>
                       {this.state.micOn ? <Icon size={30} name="md-mic" style={styles.micStyle}/> : <Icon size={30} name="md-mic-off" style={styles.micStyle}/>}
                   </TouchableOpacity>
              </View>
...
...
</View>
    )
}
}

const styles = StyleSheet.create({
  container: {
      paddingTop: 53
  },
...
...
...
  micStyle: {
      padding: 10,
      margin: 5,
      alignItems: 'center'
  }
})

After adding the code correctly, save all the changes and run your app. Now you can see a mic icon in the text input box, which enables the speech-to-text feature.

In the above code we are calling a function named _buttonClick() which contains the speech-to-text logic. This will automatically start recognizing and adjusting for the English language. You can use different languages for speech; you can check here for more information.
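
For example, inside the async _buttonClick function you can recognize speech in the device's default language instead of English; this is a minimal sketch using SpeechAndroid.DEFAULT, the constant used in the final Translator.js later in this post:

// Recognize speech in the device's default language rather than forcing English.
var spokenText = await SpeechAndroid.startSpeech("", SpeechAndroid.DEFAULT);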

Now we successfully implemented speech to text to our Translator app. Let’s add text to speech feature which will turn the translated text into speech. For that we have a library called react-native-tts which converts text to speech.

Install react-native-tts in our project. Go to the command line and navigate to project root directory and run the following command:

npm install --save react-native-tts
react-native link react-native-tts

The first command installs the library.
The second command links the library to your Android project.

Now add the following code in your Translator.js file

import React, { Component } from 'react';
...
...
import Icon from "react-native-vector-icons/Ionicons";
import SpeechAndroid from 'react-native-android-voice';
import Tts from 'react-native-tts';

export default class Translator extends Component {
constructor(props) {
      super(props);
      this.state = {
          languageFrom: "",
          ...
          ...
          micOn: false, //Add this
      };
      this._buttonClick = this._buttonClick.bind(this); //Add this
  }


handleTranslate = () => {
       this.setState({submit: true})
       const translator = TranslatorFactory.createTranslator();
       translator.translate(this.state.inputText).then(translated => {
           // alert(translated)
           Tts.getInitStatus().then(() => {
               Tts.speak(translated);
           });
           Tts.stop();
       });
   }
...

render() {
         ...
    )
}
}

In the above code we have added the text-to-speech logic in the handleTranslate function, which is called when the submit button is clicked.

Now our final Translator.js file will look like below:

import React, { Component } from 'react';
import { PowerTranslator, ProviderTypes, TranslatorConfiguration, TranslatorFactory } from 'react-native-power-translator';
import { View, TextInput, StyleSheet, TouchableOpacity, TouchableHighlight, Text, Picker, Image, ToastAndroid } from 'react-native';
import Icon from "react-native-vector-icons/Ionicons";
import Tts from 'react-native-tts';
import Languages from './languages.json';
import SpeechAndroid from 'react-native-android-voice';

export default class Translator extends Component {

   constructor(props) {
       super(props);
       this.state = {
           languageFrom: "",
           languageTo: "",
           languageCode: 'en',
           inputText: "",
           outputText: "",
           submit: false,
           micOn: false,
       };
       this._buttonClick = this._buttonClick.bind(this);
   }
   handleTranslate = () => {
       this.setState({submit: true})
       const translator = TranslatorFactory.createTranslator();
       translator.translate(this.state.inputText).then(translated => {
           Tts.getInitStatus().then(() => {
               Tts.speak(translated);
           });
           Tts.stop();
       });
   }
   async _buttonClick(){
       await this.setState({micOn: true})
       try{
           var spokenText = await SpeechAndroid.startSpeech("", SpeechAndroid.DEFAULT);
           await this.setState({inputText: spokenText});
           await ToastAndroid.show(spokenText , ToastAndroid.LONG);
       }catch(error){
           switch(error){
               case SpeechAndroid.E_VOICE_CANCELLED:
                   ToastAndroid.show("Voice Recognizer cancelled" , ToastAndroid.LONG);
                   break;
               case SpeechAndroid.E_NO_MATCH:
                   ToastAndroid.show("No match for what you said" , ToastAndroid.LONG);
                   break;
               case SpeechAndroid.E_SERVER_ERROR:
                   ToastAndroid.show("Google Server Error" , ToastAndroid.LONG);
                   break;
           }
       }
       this.setState({micOn: false})
   }

   render() {
       TranslatorConfiguration.setConfig(ProviderTypes.Google, 'XXXXXXXXX', this.state.languageCode);
       return (
           <View style = {styles.container}>
               <View style={styles.input}>
                   <TextInput
                       style={{flex:1, height: 80}}
                       placeholder="Enter Text"
                       underlineColorAndroid="transparent"
                       onChangeText = {inputText => this.setState({inputText})}
                       value={this.state.inputText}
                   />
                   <TouchableOpacity onPress={this._buttonClick}>
                       {this.state.micOn ? <Icon size={30} name="md-mic" style={styles.ImageStyle}/> : <Icon size={30} name="md-mic-off" style={styles.ImageStyle}/>}
                   </TouchableOpacity>
               </View>

               <Picker
               selectedValue={this.state.languageTo}
               onValueChange={ lang => this.setState({languageTo: lang, languageCode: lang})}
               >
                    {Object.keys(Languages).map(key => (
                        <Picker.Item key={key} label={Languages[key]} value={key} />
                    ))}
               </Picker>

               <View style = {styles.output}>
                   {this.state.submit && <PowerTranslator text={this.state.inputText} />}
                   {/* onTranslationEnd={this.textToSpeech} */}
               </View>
               <TouchableOpacity
                   style = {styles.submitButton}
                   onPress = {this.handleTranslate}
               >
                   <Text style = {styles.submitButtonText}> Submit </Text>
               </TouchableOpacity>
           </View>
       )
   }
}

const styles = StyleSheet.create({
   container: {
       paddingTop: 53
   },
   input: {
       flexDirection: 'row',
       justifyContent: 'center',
       alignItems: 'center',
       backgroundColor: '#fff',
       borderWidth: .5,
       borderColor: '#000',
       // height: 40,
       borderRadius: 5 ,
       margin: 10
   },
   output: {
       flexDirection: 'row',
       justifyContent: 'center',
       alignItems: 'center',
       backgroundColor: '#fff',
       borderWidth: .5,
       borderColor: '#000',
       borderRadius: 5 ,
       margin: 10,
       height: 80,
   },
   ImageStyle: {
       padding: 10,
       margin: 5,
       alignItems: 'center'
   },
   submitButton: {
       backgroundColor: '#7a42f4',
       padding: 10,
       margin: 15,
       borderRadius: 5 ,
       height: 40,
   },
   submitButtonText:{
       color: 'white'
   },
})

Make sure you have replaced 'XXXXXXXXX' with your Google/Microsoft API key in TranslatorConfiguration in the render method.

That's it. Now we have language translation, speech-to-text, and text-to-speech features in our Translator application. We are ready to go now. Reload/run your app and you should see a fully functional app.

When the user taps on the mic icon, an Android speech recognizer popup will be displayed as below.

If the user doesn't speak or Google doesn't recognize the speech, then it shows up as below:

Once Google recognizes the speech, select the language you want to translate to and tap the submit button; you will then receive the translated text as speech.

That’s it folks!

This story is authored by Venu Vaka. Venu is a software engineer and machine learning enthusiast.

Processing Kinesis Data Streams with Spark Streaming


Solution Overview: In this blog, we are going to build a real-time anomaly detection solution using Spark Streaming. Kinesis Data Streams will act as the input streaming source, and the anomalous records will be written to DynamoDB.

Amazon Kinesis Data Streams (KDS) is a massively scalable and durable real-time data streaming service. KDS can continuously capture gigabytes of data per second from hundreds of thousands of sources such as website clickstreams, database event streams, financial transactions, social media feeds, IT logs, and location-tracking events.

Data Streams

The unit of data stored by Kinesis Data Streams is a data record. A data stream represents a group of data records.

For deep dive into Kinesis Data Streams, please go through these official docs.

Kinesis Data Streams Producers

A producer puts data records into Amazon Kinesis Data Streams. For example, a web server sending log data to a Kinesis Data Stream is a producer.

For more details about Kinesis Data Streams Producers, please go through these official docs.

Kinesis Data Streams Consumers

A consumer, known as an Amazon Kinesis Data Streams application, is an application that you build to read and process data records from Kinesis Data Streams.

For more details about Kinesis Data Streams Consumers, please go through these official docs.


Creating a Kinesis Data Stream

Step1. Go to Amazon Kinesis console -> click on Create Data Stream

Step2. Give the Kinesis stream name and the number of shards as per the volume of the incoming data. In this case, the Kinesis stream name is kinesis-stream and the number of shards is 1.

Shards in Kinesis Data Streams

A shard is a uniquely identified sequence of data records in a stream. A stream is composed of one or more shards, each of which provides a fixed unit of capacity.

For more about shards, please go through these official docs.

Step3. Click on Create Kinesis Stream

Kinesis Data Streams can be connected with Kinesis Data Firehose to write the streams into S3.


Configure Kinesis Data Streams with Kinesis Data Producers

The Amazon Kinesis Data Generator (KDG) makes it easy to send data to Kinesis Streams or Kinesis Firehose.

While following this link, choose to Create a Cognito User with Cloud Formation.

After selecting the above option, we will navigate to the Cloud Formation console:

Click on Next and provide Username and Password for Cognito User for Kinesis Data Generator.

Click on Next and Create Stack.

CloudFormation Stack is created.

Click on Outputs tab and open the link

After opening the link, enter the username and password of the Cognito user.

After sign-in is completed, select the region and the stream, and configure the number of records per second. Choose the record template as per your requirement.

In this case, the template data format is

{{name.firstName}},{{random.number({"min":10, "max":550})}},{{random.arrayElement(["OK","FAIL","WARN"] )}}

The template data looks like the following (each record is a random first name, a number between 10 and 550, and one of OK/FAIL/WARN):

You can send different types of dummy data to Kinesis Data Streams.

Kinesis Data Streams with Kinesis Data Producers are ready. Now we shall build a Spark Streaming application which consumes data streams from Kinesis Data Streams and dumps the output streams into DynamoDB.


Create DynamoDB Tables To Store Data Frame

Go to Amazon DynamoDB console -> Choose Create Table and name the table, in this case, data_dump

In the same way, create another table named data_anomaly (this is the table the Spark application writes anomalous records to). Make sure the Kinesis data stream and the DynamoDB tables are in the same region.

Spark Streaming with Kinesis Data Streams

Spark Streaming

Spark Streaming is an extension of the core Spark API that enables scalable, high-throughput, fault-tolerant stream processing of live data streams. Data can be ingested from many sources like Kafka, Flume, Kinesis, or TCP sockets, and can be processed using complex algorithms expressed with high-level functions like map, reduce, join and window.

For deep dive into Spark Streams, please go through docs.

In this case, the Scala programming language is used. Scala version is 2.11.12. Please install scala, sbt and spark.

Create a folder structure like the following

Kinesis-spark-streams-dynamo
| -- src/main/scala/packagename/object
| -- build.sbt
| -- project/assembly.sbt

In this case, the structure looks like the following

After creating the folder structure, replace the build.sbt file with the following code. It adds the required dependencies such as Spark core, Spark SQL, Spark Streaming, the Spark Kinesis assembly and more.

name := "kinesis-spark-streams-dynamo"

version := "0.1"

scalaVersion := "2.11.12"

libraryDependencies += "com.audienceproject" %% "spark-dynamodb" % "0.4.1"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.3"
libraryDependencies += "com.google.guava" % "guava" % "14.0.1"
libraryDependencies += "com.amazonaws" % "aws-java-sdk-dynamodb" % "1.11.466"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.3"
libraryDependencies += "org.apache.spark" %% "spark-streaming" % "2.4.3"
libraryDependencies += "org.apache.spark" %% "spark-streaming-kinesis-asl" % "2.4.3"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.3"

assemblyMergeStrategy in assembly := {
  case PathList("META-INF", xs @ _*) => MergeStrategy.discard
  case x => MergeStrategy.first
}

Please replace the project/assembly.sbt file with the following code. This adds the sbt-assembly plugin, which is used for creating the jar.

addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9")

Please replace the main Scala source file (the object under src/main/scala) with the following code.

package com.wisdatum.kinesisspark

import com.amazonaws.auth.DefaultAWSCredentialsProviderChain
import org.apache.spark._
import org.apache.spark.streaming._
import com.amazonaws.services.kinesis.AmazonKinesis
import scala.collection.JavaConverters._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kinesis.KinesisInputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.DStream
import com.amazonaws.regions.RegionUtils
import com.amazonaws.services.kinesis.AmazonKinesisClient
import org.apache.log4j.{Level, Logger}
import com.audienceproject.spark.dynamodb.implicits._

object KinesisSparkStreamsDynamo {

  def getRegionNameByEndpoint(endpoint: String): String = {
    val uri = new java.net.URI(endpoint)
    RegionUtils.getRegionsForService(AmazonKinesis.ENDPOINT_PREFIX)
      .asScala
      .find(_.getAvailableEndpoints.asScala.toSeq.contains(uri.getHost))
      .map(_.getName)
      .getOrElse(
        throw new IllegalArgumentException(s"Could not resolve region for endpoint: $endpoint"))
  }

  def main(args: Array[String]) {

    val rootLogger = Logger.getRootLogger()
    rootLogger.setLevel(Level.ERROR)

    val conf = new SparkConf().setAppName("KinesisSparkExample").setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(1))
    println("Launching")
    val Array(appName, streamName, endpointUrl, dynamoDbTableName) = args
    println(streamName)
    val credentials = new DefaultAWSCredentialsProviderChain().getCredentials()

    require(credentials != null,
      "No AWS credentials found. Please specify credentials using one of the methods specified " +
        "in http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html")
    val kinesisClient = new AmazonKinesisClient(credentials)
    kinesisClient.setEndpoint(endpointUrl)
    val numShards = kinesisClient.describeStream(streamName).getStreamDescription().getShards().size
    println("numShards are " + numShards)

    val numStreams = numShards

    val batchInterval = Milliseconds(100)

    val kinesisCheckpointInterval = batchInterval

    val regionName = getRegionNameByEndpoint(endpointUrl)

    val anomalyDynamoTable = "data_anomaly"

    println("regionName is " + regionName)

    val kinesisStreams = (0 until numStreams).map { i =>
      KinesisInputDStream.builder
        .streamingContext(ssc)
        .streamName(streamName)
        .endpointUrl(endpointUrl)
        .regionName(regionName)
        .initialPositionInStream(InitialPositionInStream.LATEST)
        .checkpointAppName(appName)
        .checkpointInterval(kinesisCheckpointInterval)
        .storageLevel(StorageLevel.MEMORY_AND_DISK_2)
        .build()
    }

    val unionStreams = ssc.union(kinesisStreams)

    val inputStreamData = unionStreams.map { byteArray =>
      val Array(sensorId, temp, status) = new String(byteArray).split(",")
      StreamData(sensorId, temp.toInt, status)
    }

    val inputStream: DStream[StreamData] = inputStreamData

    inputStream.window(Seconds(20)).foreachRDD { rdd =>
      val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._

      val inputStreamDataDF = rdd.toDF()
      inputStreamDataDF.createOrReplaceTempView("hot_sensors")

      val dataDumpDF = spark.sql("SELECT * FROM hot_sensors ORDER BY currentTemp DESC")
      dataDumpDF.show(2)
      dataDumpDF.write.dynamodb(dynamoDbTableName)

      val anomalyDf = spark.sql("SELECT * FROM hot_sensors WHERE currentTemp > 100 ORDER BY currentTemp DESC")
      anomalyDf.write.dynamodb(anomalyDynamoTable)
    }

    // To make sure data is not deleted by the time we query it interactively
    ssc.remember(Minutes(1))

    ssc.start()
    ssc.awaitTermination()
  }
}

case class StreamData(id: String, currentTemp: Int, status: String)

appName: The application name that will be used to checkpoint the Kinesis sequence numbers in the DynamoDB table.

  1. The application name must be unique for a given account and region.
  2. If the table exists but has incorrect checkpoint information (for a different stream, or old expired sequence numbers), then there may be temporary errors.

kinesisCheckpointInterval
The interval (e.g., Duration(2000) = 2 seconds) at which the Kinesis Client Library saves its position in the stream. For starters, set it to the same as the batch interval of the streaming application.

endpointURL:
Valid Kinesis endpoints URL can be found here.

For more details about building KinesisInputDStream, please go through the documentation.

Configure AWS credentials using environment variables or the aws configure command.

Make sure all the resources are under the same account and region. The CloudFormation stack that was created is in us-west-2; even though all the other resources are in another region, this does not affect the process.


Building Executable Jar

  • Open Terminal -> Go to project root directory, in this case 
    kinesis-spark-streams-dynamo
  • Run sbt assembly

The jar has been packaged into <project root directory>/target/scala-2.11/XXXX.jar. The name of the jar is the name provided in the build.sbt file.

Run the Jar using spark-submit

  • Open Terminal -> Go to Spark bin directory
  • Run the following command:
./bin/spark-submit ~/Desktop/kinesis-spark-streams-dynamo/target/scala-2.11/kinesis-spark-streams-dynamo-assembly-0.1.jar appName streamName endpointUrl dynamoDbTable

To know more about how to submit applications using spark-submit, please review this.

The arguments passed are highlighted in the blue box above. Place the arguments as needed.

Read Kinesis Data Streams in Spark Streams

  1. Go to Amazon Kinesis Data Generator-> Sign In using Cognito user
  2. Click on Send Data, it starts sending data to Kinesis Data Streams

Data will be sent to the Kinesis data stream, in this case kinesis-stream, and it looks like this.

Monitoring Kinesis Data Streams

Go to Amazon Kinesis Console -> Choose Data streams -> Select created Data Stream -> click on Monitoring

The terminal looks like the following when it starts receiving the data from Kinesis Data Streams

The data_dump table has all the data coming from the Kinesis data stream. The data in the data_dump table looks like this:

The data_anomaly table has the records where currentTemp is greater than 100; here the anomaly is a temperature greater than 100. The data in the data_anomaly table looks like this:

I hope this article was helpful in setting up Kinesis Data Streams that are consumed and processed using Spark Streaming and stored in DynamoDB.

This story is authored by P V Subbareddy. He is a Big Data Engineer specializing on AWS Big Data Services and Apache Spark Ecosystem.

A Beginner’s Guide to Airflow

Airflow is used to create code pipelines where we can schedule and monitor our workflows. A workflow is a collection of tasks to be executed, like a flowchart.

It is like an orchestra conductor that controls all different data processing tools/tasks under one roof.

Why Airflow?

  • Open source.
  • Can handle upstream/downstream in an elegant way.
  • Jobs can pass parameters to other jobs downstream.
  • Ease of deployment of workflow.
  • Easy to reprocess historical jobs by date or rerun for specific intervals.
  • Integration with a lot of infrastructure.
  • Job testing through airflow itself.
  • Accessibility of log files and other meta-data through the web GUI.
  • Data sensors to trigger Directed Acyclic Graph (DAG) when data arrives.

Airflow Architecture:

Metadata database: It stores the state of tasks and workflows.
Scheduler: It reads the user's DAGs, along with the state of tasks in the metadata database, and decides what needs to be executed.
Executor: It is a message queuing process that decides which worker will execute each task.

Setting up airflow:

Use the following commands to install airflow

$ sudo pip install apache-airflow

You can install extra features like

$ sudo pip install apache-airflow[postgres,s3]

Airflow requires its database to be initialized before you run tasks:

$ airflow initdb

After installation the folder structure will be something like this.

If you don't see a 'dags' folder, you can create one yourself and name it 'dags'. Put all your Python task files in that folder.

To start the web server

$ airflow webserver -p 8080

Go to http://localhost:8080 to see the Airflow GUI. You should see something like this:

Airflow relies on 4 core concepts:

  1. DAG.
  2. Tasks.
  3. Scheduler.
  4. X-com.

Directed Acyclic Graph (DAG):

It is a collection of all tasks which you want to run in an organized way that shows their relationships and dependencies.

Basically a DAG is just a Python file, which is used to organise tasks and set their execution context. DAGs do not perform any actual computation. Basically a DAG contains 2 types of things:

  1. Operators
    It determines what actually gets done. It triggers a certain action like running a bash command, executing a python function, etc.
    There are different kinds of operators like:
    Bash operator: Executes a bash command.
    Python operator: Calls an arbitrary python function.
    Hive operator: Executes HQL code or a Hive script in a specific Hive database.
    Bigquery operator: Executes Google BigQuery SQL query in a specific BigQuery database.
  2. Sensors: A certain type of operator that will keep running until a certain criterion is met.

Tasks :
Tasks are the elements that actually “do work” which we want performed. It is our job to write the configuration and organise tasks in a specific order to create the data pipeline.

Once an operator is instantiated, it is referred to as a “task”. An operator describes a single task in a workflow.

Scheduler :
It monitors all tasks and all DAGs, and triggers the task instances whose dependencies have been met.
We create a new Python file in the dags folder.
Importing packages in our Python file:

import datetime as dt
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator

Next, set the default arguments:

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': dt.datetime(2019, 6, 17, 8, 40, 00),
    'retries': 1,
}

depends_on_past: sometimes a task t1 depends on its own previous execution; in that case we use this.

start_date: the starting date of execution.
Defining our functions, tasks and DAG:

def greet():
    print("Hello how are you ?")
    return 'greeted'

def response():
    return "im fine what about you?"

with DAG('YourDagName', default_args=default_args, schedule_interval='@daily',) as dag:
    opr_hello = BashOperator(task_id='say_Hi',bash_command='echo "Hi!!"')
    opr_greet = PythonOperator(task_id='greet', python_callable=greet)
    opr_sleep = BashOperator(task_id='sleep_me', bash_command='sleep 2')
    opr_respond = PythonOperator(task_id='respond', python_callable=response)

Here I am using 2 simple functions, and in the DAG I pass my default arguments and also the schedule interval of execution.

After that we created 4 tasks, of which 2 are Python and the other 2 are Bash. If we want to execute a Python function we need to use the Python operator, and the same goes for the Bash operator. The Python operator takes at least 2 arguments, a task id and a python callable; the Bash operator takes a task id and a bash command.

Defining task dependencies, or the order in which the tasks should be executed:

Task dependencies are set using

  • The set_upstream and set_downstream methods.
  • The bitshift operators << and >>.

t1.set_downstream(t2)
t1>>t2

The above 2 lines mean the same thing: t2 depends on t1 running successfully.

opr_hello >> opr_greet >> opr_sleep >> opr_respond

If you wrap up the whole code in one file and put that file in the dags folder, you can see your DAG in the Airflow GUI. If you click on your DAG you will see the tree view and many other options to view your data in different forms.

A DagRun is a DAG run at a certain time.
TaskInstances are the tasks belonging to a DagRun.
Each DagRun and TaskInstance is associated with an entry in Airflow's metadata database that logs their state (e.g. queued, running, failed, skipped, etc.).
X-com: XCom, which means cross-communication, allows Airflow tasks to send and receive data/status. XCom data is stored in the Airflow database in the form of key-value pairs.

That’s all guys. Thanks for the read!

This story is co-authored by Santosh Kumar and PV Subbareddy. Santosh is a Software Engineer specializing on Cloud Services and DevOps. Subbareddy is a Big Data Engineer specializing on AWS Big Data Services and Apache Spark Ecosystem.

Building a Simple React Application using Redux

In this article I will be building a simple React app that uses Redux to manage its state. If you are a complete beginner, I recommend first reading my A Beginner's Guide to Understanding Redux article and then following this one. For this article, I have built a simple React app that renders a dummy signup form and a table which displays a list of employee details (dummy data). We shall integrate it with Redux, but first do the initial setup to follow along.

Initial Setup

Please clone this specific branch ‘without-redux’ from my GitHub repository.

git clone -b without-redux --single-branch https://github.com/koushik-bitzop/newempdetails.git

Run the above command in your terminal; this should create a folder and clone all the required code files from my repository.
To get started, install the dependencies:

sudo npm install

Start the application server:

sudo npm start

You should see the app running on your localhost, like this:

The code files that you have cloned are the same as those below.

App.js

import React, {Component} from 'react';
import Header from './components/Header';
import Routes from './components/Routes';
import { BrowserRouter as Router} from 'react-router-dom';

class App extends Component {
 state = {  }
 render() {
   return (
   <Router>
       <Header/>
       <div className="row">
           <Routes />
       </div>
   </Router>
   );
 }
}

export default App;

Routes.js

import React, {Component} from 'react';
import {Switch} from 'react-router';
import {Route} from 'react-router-dom';
import FormdataTable from './FormdataTable';
import SignupForm from './SignupForm';

class Routes extends Component {
   render() {
       return ( 
           <Switch>
               <Route path={'/'} exact component={SignupForm } ></Route>
               <Route path={'/viewlist'} exact component={FormdataTable}></Route>
           </Switch>
       );
   }
}

export default Routes;

Header.js

import React,{Component} from 'react';
import {Link, NavLink} from 'react-router-dom';

class Header extends Component{
   render(){
       return(
           <nav className="navbar navbar-expand-lg navbar-light bg-light">
                  
               <Link to="/" className="navbar-brand" href="#"><h1>Wisdatum</h1></Link>

               <div className="collapse navbar-collapse" id="navbarNav">
                   <ul className="navbar-nav">
                       <li className="nav-item active">
                           <NavLink to="/" className="nav-link" href="#">Add new </NavLink>
                       </li>
                       <li className="nav-item">
                           <NavLink to="/viewlist" className="nav-link" href="#">View list</NavLink>
                       </li>
                   </ul>
               </div>

           </nav>
       );
   }
}

export default Header;

SignupForm.js

import React, {Component} from 'react';
import {Form, FormGroup, FormControl, FormLabel, Button} from 'react-bootstrap';

class SignupForm extends Component {
   constructor(props) {
   super(props);
   this.state = {
       firstname: '',
       lastname: '',
       email: '',
       mobile: '',
       city: ''
   };

   // This binding is necessary to make `this` work in the callback
   this.handleSubmit = this.handleSubmit.bind(this);
   }
handleSubmit(){
   let formdata = {
       firstname: this.state.firstname,
       lastname: this.state.lastname,
       email: this.state.email,
       mobile: this.state.mobile,
       city: this.state.city
   };
   console.log("submitted",formdata);
   this.setState({
       firstname: '',
       lastname: '',
       email: '',
       mobile: '',
       city: ''
   });
}
   render() {
       return (
           <div className="col-md-4 offset-md-4">
               <Form>
                   <h2 style={{"textAlign":"center", "marginTop":"20px"}}>Enter Employee Details</h2>
                   <hr/>
                   <FormGroup>
                       <FormLabel>Firstname</FormLabel>
                       <FormControl
                           type="text"
                           name="firstname"
                           placeholder="Firstname"
                           onChange={e => {
                               this.setState({[e.target.name]:e.target.value});
                           }}
                           value = {this.state.firstname}
                       />
                   </FormGroup>
              
                   <FormGroup>
                       <FormLabel>Lastname</FormLabel>
                       <FormControl
                           type="text"
                           name="lastname"
                           placeholder="Lastname"
                           onChange={e => {
                               this.setState({[e.target.name]:e.target.value});
                           }}
                           value = {this.state.lastname}
                       />
                   </FormGroup>

                   <FormGroup>
                       <FormLabel>Email</FormLabel>
                       <FormControl
                           type="text"
                           name="email"
                           placeholder="Email"
                           onChange={e => {
                               this.setState({[e.target.name]:e.target.value});
                           }}
                           value = {this.state.email}
                       />
                   </FormGroup>

                   <FormGroup>
                       <FormLabel>Mobile</FormLabel>
                       <FormControl
                           type="text"
                           name="mobile"
                           placeholder="Mobile"
                           onChange={e => {
                               this.setState({[e.target.name]:e.target.value});
                           }}
                           value = {this.state.mobile}
                       />
                   </FormGroup>

                   <FormGroup>
                       <FormLabel>City</FormLabel>
                       <FormControl
                           type="text"
                           name="city"
                           placeholder="City/Village"
                           onChange={e => {
                               this.setState({[e.target.name]:e.target.value});
                           }}
                           value = {this.state.city}
                   />
                   </FormGroup>
                   <Button onClick={this.handleSubmit}>Submit</Button>
               </Form>
           </div>
        );
   }
}
export default SignupForm;

FormdataTable.js

import React,{Component} from 'react';
import empdata from '../data/employeedata.json';

class FormdataTable extends Component {
   state = {  }
   render() {
       console.log(empdata);
       return (
           <div className="col-md-10 offset-md-1">
               <h2 style={{"textAlign":"center", "marginTop":"20px", "marginBottom":"20px"}}>New Employee Details</h2>
               <table className="table table-hover">
                   <thead>
                       <tr>
                           <th scope="col">#</th>
                           <th scope="col">Fist Name</th>
                           <th scope="col">Last Name</th>
                           <th scope="col">Email</th>
                           <th scope="col">Mobile</th>
                           <th scope="col">City</th>
                           <th scope="col">Options</th>
                       </tr>
                   </thead>
                   <tbody>
                       {
                       empdata.map((emp, index) => {
                       return(
                           <tr key={index}>
                                   <td>{index+1}</td>
                                   <td>{emp.firstname}</td>
                                   <td>{emp.lastname}</td>
                                   <td>{emp.email}</td>
                                   <td>{emp.mobile}</td>
                                   <td>{emp.city}</td>
                                   <td>
                                       <button
                                           type="button"
                                           onClick={()=>this.handleEdit(index)}
                                           className="btn btn-sm btn-primary">Edit
                                       </button>
                                       {" | "}
                                       <button
                                           type="button"
                                           onClick={()=>this.handleDelete(index)}
                                           className="btn btn-sm btn-danger">Delete
                                       </button>
                                   </td>
                           </tr>
                       )})
                       }
                   </tbody>
               </table>
           </div>
       );
   }
}
export default FormdataTable;

Before we begin integrating with redux, please review and understand the code above. I have used bootstrap for styling and react-router for creating the navigation/routes in the application.

Integrating with redux:

The above diagram depicts the Redux workflow in a nutshell: the UI triggers actions like form submit, or edit and delete in the table. Each action is represented by an action object; these action objects are dispatched to the reducer function, which updates the store. The store contains the state. All the components get their data from the store and are updated with the new state. The new state re-renders the UI with the new changes, and the cycle repeats.

Redux is external to React, so we need the redux and react-redux libraries to connect Redux with React.

npm install redux react-redux --save

We shall go step by step.

Step1: Create a root reducer function

The reducer function always talks to the store: it is used to create the store and to update it.

allreducer.js (create in src folder)

import empdata_json from '../data/employeedata.json';

function empdata(state = empdata_json, action){
    switch(action.type){
        default:
            return state;
    }
}

export default empdata;

The above reducer function accepts two arguments: the current state and an action object. The initial value of this state is set to dummy data imported from the employeedata.json file. Whenever this function is invoked, the corresponding block in the switch is executed based on the action type. Though the logic of the blocks may differ, they all do pretty much the same thing: take the payload and return the new state to update the store.

Step2: Create the store and make it available to the whole tree

We have to import the above reducer function and use createStore method available in redux to create the store.

To make this store available to the whole app component tree, we wrap the nested app component tree in the Provider component available in react-redux, passing the store as a prop.

The <Provider /> makes the Redux store available to any nested components that have been wrapped in the connect() function. We will learn about connect() in the next step. Since any React component in a React Redux app can be connected, most applications will render a <Provider> at the top level, with the entire app’s component tree inside of it.

Normally, you can’t use a connected component unless it is nested inside of a <Provider>.

index.js (modify this file)

import React from 'react';
import ReactDOM from 'react-dom';
import './index.css';
import App from './App';
import * as serviceWorker from './serviceWorker';
import empdatareducer from '../src/allreducer';
import {createStore} from 'redux';
import {Provider} from 'react-redux';

const store = createStore(empdatareducer);
ReactDOM.render(<Provider store={store}><App /></Provider>, document.getElementById('root'));

// If you want your app to work offline and load faster, you can change
// unregister() to register() below. Note this comes with some pitfalls.
// Learn more about service workers: https://bit.ly/CRA-PWA
serviceWorker.unregister();

Step3: Connecting with redux

Now that the store is available to the whole component tree, components have to be connected to Redux and mapped to the store to get data (as props). The react-redux library comes with a function called connect, which is how you feed data from Redux's store into your React components and also dispatch actions to the reducer function. The connect function is commonly passed 1 or 2 arguments:

First, a mapStateToProps function that takes pieces of state out of Redux and assigns them to props that your React component will use.

And often a second argument: a mapDispatchToProps function, which binds action creator functions (which return action objects) with dispatch, so you can just write props.actionName() and you don't have to write dispatch(actionObject) for every event or action. The dispatch is invoked as soon as the action creator returns an action object.

Remember, you can't dispatch an action to a specific reducer function. Whenever there is a dispatch, it is received by all reducers; only those blocks where the action type matches are executed.

As our FormdataTable.js component requires the store data, we map its props to the store.

FormdataTable.js (modify this file)

import React,{Component} from 'react';
import {connect } from 'react-redux';

class FormdataTable extends Component {
  state = {  }
  render() {
      //console.log(this.props.empdata);
      return (
          <div className="col-md-10 offset-md-1">
              <h2 style={{"textAlign":"center", "marginTop":"20px", "marginBottom":"20px"}}>New Employee Details</h2>
              <table className="table table-hover">
                  <thead>
                      <tr>
                          <th scope="col">#</th>
                          <th scope="col">Fist Name</th>
                          <th scope="col">Last Name</th>
                          <th scope="col">Email</th>
                          <th scope="col">Mobile</th>
                          <th scope="col">City</th>
                          <th scope="col">Options</th>
                      </tr>
                  </thead>
                  <tbody>
                      {
                      this.props.empdata.map((emp, index) => {
                      return(
                          <tr key={index}>
                                  <td>{index+1}</td>
                                  <td>{emp.firstname}</td>
                                  <td>{emp.lastname}</td>
                                  <td>{emp.email}</td>
                                  <td>{emp.mobile}</td>
                                  <td>{emp.city}</td>
                                  <td>
                                      <button
                                          type="button"
                                          onClick={()=>this.handleEdit(index)}
                                          className="btn btn-sm btn-primary">Edit
                                      </button>
                                      {" | "}
                                      <button
                                          type="button"
                                          onClick={()=>this.handleDelete(index)}
                                          className="btn btn-sm btn-danger">Delete
                                      </button>
                                  </td>
                          </tr>
                      )})
                      }
                  </tbody>
              </table>
          </div>
      );
  }
}
function mapStateToProps(state){
     return {
       empdata : state
   };
}
export default connect(mapStateToProps,null)(FormdataTable);

Our signup form data needs to be added to the store; this is an action and should be dispatched. We will first write an action creator function and then bind it with dispatch using bindActionCreators, available in redux. Only then can we dispatch it to the reducer to update the store.

allactions.js (create in src folder)

export function addNewEmp(payload){
   const action = {
       type: "ADD_NEW_EMP",
       payload
   }
   return action;
}

In the above case the payload is our form data. For the store to actually be updated, the reducer also needs a case that handles this action type, as sketched below. Let us then bind this action creator with dispatch and connect to Redux so we can dispatch our action object containing the payload (form data).
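
Here is a minimal sketch of how allreducer.js could handle the ADD_NEW_EMP action (assuming the new employee is simply appended to the existing array):

import empdata_json from '../data/employeedata.json';

function empdata(state = empdata_json, action){
    switch(action.type){
        case "ADD_NEW_EMP":
            // Append the dispatched form data (action.payload) to the current employee list
            return [...state, action.payload];
        default:
            return state;
    }
}

export default empdata;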

SignupForm.js (modify this file)

import React,{Component} from 'react';
import {Form, FormGroup, FormControl, FormLabel, Button} from 'react-bootstrap';

import {connect} from 'react-redux';
import {bindActionCreators} from 'redux';
import {addNewEmp} from '../allactions';

class SignupForm extends Component {
  constructor(props) {
  super(props);
  this.state = {
      firstname: '',
      lastname: '',
      email: '',
      mobile: '',
      city: ''
  };

  // This binding is necessary to make `this` work in the callback
  this.handleSubmit = this.handleSubmit.bind(this);
  }
handleSubmit(){
  let formdata = {
      firstname: this.state.firstname,
      lastname: this.state.lastname,
      email: this.state.email,
      mobile: this.state.mobile,
      city: this.state.city
  };
  this.props.addNewEmp(formdata);
  console.log("submitted",formdata);
  this.setState({
      firstname: '',
      lastname: '',
      email: '',
      mobile: '',
      city: ''
  });
}
  render() {
      return (
          <div className="col-md-4 offset-md-4">
              <Form>
                  <h2 style={{"textAlign":"center", "marginTop":"20px"}}>Enter Employee Details</h2>
                  <hr/>
                  <FormGroup>
                      <FormLabel>Firstname</FormLabel>
                      <FormControl
                          type="text"
                          name="firstname"
                          placeholder="Firstname"
                          onChange={e => {
                              this.setState({[e.target.name]:e.target.value});
                          }}
                          value = {this.state.firstname}
                      />
                  </FormGroup>
            
                  <FormGroup>
                      <FormLabel>Lastname</FormLabel>
                      <FormControl
                          type="text"
                          name="lastname"
                          placeholder="Lastname"
                          onChange={e => {
                              this.setState({[e.target.name]:e.target.value});
                          }}
                          value = {this.state.lastname}
                      />
                  </FormGroup>

                  <FormGroup>
                      <FormLabel>Email</FormLabel>
                      <FormControl
                          type="text"
                          name="email"
                          placeholder="Email"
                          onChange={e => {
                              this.setState({[e.target.name]:e.target.value});
                          }}
                          value = {this.state.email}
                      />
                  </FormGroup>

                  <FormGroup>
                      <FormLabel>Mobile</FormLabel>
                      <FormControl
                          type="text"
                          name="mobile"
                          placeholder="Mobile"
                          onChange={e => {
                              this.setState({[e.target.name]:e.target.value});
                          }}
                          value = {this.state.mobile}
                      />
                  </FormGroup>

                  <FormGroup>
                      <FormLabel>City</FormLabel>
                      <FormControl
                          type="text"
                          name="city"
                          placeholder="City/Village"
                          onChange={e => {
                              this.setState({[e.target.name]:e.target.value});
                          }}
                          value = {this.state.city}
                  />
                  </FormGroup>
                  <Button onClick={this.handleSubmit}>Submit</Button>
              </Form>
          </div>
       );
  }
}

function mapDispatchToProps(dispatch){
   return bindActionCreators({addNewEmp},dispatch)
}
export default connect (null, mapDispatchToProps)(SignupForm)

It's that simple: we have successfully managed state with redux. Now let's test it.

Now, please try to implement the delete functionality by yourself, as shown below.

Before deletion:

After deletion:

If you could do it, you are doing great. If not, please feel free to refer to my code files below.

https://github.com/koushik-bitzop/newempdetails/commit/cd9712a89c4de7625cecbd6e494199f0ad5dd20f?diff=split

I hope this article was helpful to you in learning redux. Thanks for the read!

This story is authored by Koushik. He is a software engineer and a keen data science and machine learning enthusiast.

Case Study – Apache Log Analysis using Logstash-Elasticsearch-Kibana (ELK) Stack

In the previous blog, we loaded apache log data into Elasticsearch with Logstash. Now our goal is to read this data into Kibana to help us run some analytics use cases. Quick note – the entire log file will not only be read into Elasticsearch but will also be displayed on the standard output. It takes about 3-4 minutes to display the entire log file (remove “ignore_older => 0” from the config file to read older logs). To cross check that the data has been loaded and indices have been created in Elasticsearch, type the following in the browser: http://localhost:9200/_cat/indices (replace “localhost” with the name of the server that Elasticsearch is running on). This will show all the indices created; Logstash creates indices whose names start with logstash-*. Once you find the logstash indices, it’s time to get them into Kibana.
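
If you prefer to cross check from code instead of the browser, a quick sketch using Python's requests library (assuming Elasticsearch is reachable on localhost:9200) could look like this:

import requests

# List all indices; Logstash creates indices named logstash-YYYY.MM.DD
resp = requests.get("http://localhost:9200/_cat/indices?v")
for line in resp.text.splitlines():
    # print the header row plus any logstash indices
    if line.startswith("health") or "logstash-" in line:
        print(line)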

Kibana accesses Elasticsearch indices using “index patterns”.  We specify the  pattern of the index name we are searching for, and create an index pattern for Kibana to fetch the data from Elasticsearch. If the difference between index name and index pattern is not immediately clear, please wait till we create index patterns in Kibana.

Log into Kibana from the browser using http://localhost:5601/ (replace “localhost” with the IP/name of the server Kibana is running on). The Kibana home page will open up; if it doesn’t, please check that Elasticsearch and Kibana are up and running on the server. In case you need to troubleshoot, please check the earlier post on troubleshooting Kibana.

From the Kibana home page (left side Menu), click on Management -> Index Patterns -> “+ Create Index Pattern”. The following page opens up

In the Index Patterns field, type “logstash-*” and Kibana will display all the indexes in Elasticsearch whose name matches the given pattern. Click on “next” and choose “@timestamp” so we can filter our data by time.

Click on “Create index pattern” button and an index pattern will be created with all the fields being displayed

With the index pattern created, we are ready to use the apache_log data in Kibana. Click on “Discover” from the left side Menu and choose logstash* from the drop down, and all the data from the log will be displayed here. If you are using the same log as mine, initially you will not be able to see any data. That’s because the time filter in the top right corner of the page defaults to “Last 15 minutes”. Since this log is an old one, click on the time filter, choose “Quick” and then select the “Last 5 years” option and bingo! the log data shows up on the screen.

If the above setting is not clear, please check the screenshot below

In case you need a refresher on Kibana visualizations, check this out. You can use Discover to get a pie-chart of the different requests coming in. Let’s say you want to analyze the various request keywords for your web server traffic. This visualization shows the various requests (aggregated by “Terms” with the field “request.keyword”) that hit the apache server.
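
For reference, the same request-keyword breakdown can be pulled straight out of Elasticsearch with a terms aggregation. Below is a sketch using the Python client (the field name request.keyword comes from the grok filter in our pipeline; the host and index pattern are assumptions based on our setup):

from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# Top 10 request paths across all logstash-* indices
body = {
    "size": 0,
    "aggs": {
        "requests": {
            "terms": {"field": "request.keyword", "size": 10}
        }
    }
}
result = es.search(index="logstash-*", body=body)
for bucket in result["aggregations"]["requests"]["buckets"]:
    print(bucket["key"], bucket["doc_count"])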

How does it help? Well, for websites with a huge volume of traffic, this helps us understand the pattern of resource consumption. Common questions that we can answer:

  • Is the new blog post garnering all the attention?
  • Is it the new pair of shoes that are being seen so frequently?
  • Are people interested in self help books or easy comedy?

Another use-case may be to analyze the HTTP response codes of the web server. We are pulling up the same pie-chart for the different response codes server has generated.

What do we infer from this visual? Well, is the web server able to provide a proper response as expected? Are we returning too many ‘page not found’ errors? Why do we have too many ‘authentication failed’ errors? Are a majority of users really forgetting their passwords, or is something malicious going on?

In addition, we can also create dashboard level metrics for error codes like so.

For time-series analysis, we need to click on Visualize -> Time Series -> Visual Builder. Here, the screen is divided into two horizontal panes. In the bottom pane, choose the “Panel Options” tab and type the index pattern as “logstash*”, and the time series will show up as a graph like so

It shows the access rate for the given time period. Since most of the log data is clustered around the same time, let’s change the date range (from Last 5 years) to around May 18, 2015 (we can change the date as below)

and the output changes as shown below. Here, the log data is plotted at regular intervals across the particular day selected.

Let’s say this is an access log for an online shopping website and a lot of users have accessed it on May 18, 2015. Why? Probably because the company has come up with certain discounts or launched a new product. If this data is considered in real time, we can visualize the number of people accessing the server currently. If it’s the festive season, and we are expecting a lot of traffic, we can also foresee when the servers will be stretched based on the historical pattern and act accordingly. It also helps in marketing and sales: a lot of people are currently logged in, should I add an additional 5% discount to amp up my sales immediately?

If it’s a banking institution that the system is designed for, we can ask questions such as: Why are so many users trying to access the system at the same time? Are they really bonafide users, or is some malware trying to break into the server? By installing a few plugins, we can also visualize which geographic areas the requests are originating from, so we will even get to know if requests are being made from a certain place. These visualizations are really powerful and user friendly, and one doesn’t need a lot of technical expertise to use Kibana.

That’s about it on this one. I hope the blog posts in this series on ELK stack have been useful for the interested folks to sharpen their data analytics and visualizations chops.  

P.S. : some quick troubleshooting tips on Kibana index patterns:

What if the “Create index pattern” page displays a loading wheel indefinitely when you click “Create index pattern”?

Since Kibana opens in a webpage, we can use browser troubleshooting to see what’s wrong with our page. Right click on the page -> Inspect -> choose the Console tab. This shouldn’t show any errors; there can be log messages, but no errors. In my case, I had a forbidden error in red. On trying to refresh any index pattern, this error came up on screen: Config: Error 403 Forbidden: blocked by: [FORBIDDEN/12/index read-only / allow delete (api)];

This implies the indices are all read-only and hence no changes are possible. This happens when Elasticsearch runs low on disk space on the server it’s installed on. We ran out of disk space and had to add more. Elasticsearch forces the indices into read-only mode but does not return them to their normal state once space is freed up, so we had to clear the read-only block manually like so

curl -XPUT -H 'Content-Type: application/json' http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'

(localhost is to be replaced by your server IP/name). On completing successfully, it displays {"acknowledged":true}. You can refresh Kibana in the browser and try creating the index patterns now.

Introduction and Installation of Logstash

So far in this series on ELK (Elasticsearch-Logstash-Kibana) stack, we have talked about Elasticsearch and Kibana(here and here). It’s about time we look at the third pillar of this stack – Logstash.

So, what is Logstash?

Logstash is an open source data collection engine with real-time pipelining capabilities. This means that Logstash can dynamically put data from different sources together, clean/filter the data and send it to any output destinations we need. It has a pluggable architecture that allows great flexibility. Logstash can take a wide variety of unstructured data, then clean and transform it to gain near real-time insights immediately.

What is Grok?

As mentioned in the Logstash documentation, “Grok is the bread and butter of Logstash filters and is used ubiquitously to derive structure out of unstructured data.” A bit more detail on Grok will help you understand Logstash better; please see:

https://www.elastic.co/guide/en/logstash/6.4/plugins-filters-grok.html

We will now jump directly into Logstash installation.

Run the following at command line of your ubuntu server

$sudo apt-get update && sudo apt-get install logstash

The “update” command refreshes the information about available package versions, which is why it is run before every installation.

Once the installation is complete, we can test Logstash with a very basic Logstash pipeline (the concept of Logstash pipeline is like so : {inputs->filters->outputs}).

$/usr/share/logstash/bin/logstash -e 'input { stdin { } } output { stdout {} }'

The input is stdin and the output is stdout.  Once Logstash is up and running, we can enter “hello world” on the command line and Logstash prints it back with the timestamp, version and host. This confirms that the setup is working.

If you prefer to test with a sample log file, we can do so by downloading an apache log file and creating the configuration file required for Logstash.

Elasticsearch documentation provides sample apache log files to test Logstash. To download, type the following at command prompt

$wget https://raw.githubusercontent.com/elastic/examples/master/Common%20Data%20Formats/apache_logs/apache_logs

The file can also be accessed from  https://github.com/elastic/examples/tree/master/Common%20Data%20Formats/apache_logs

The configuration file can be created like so

$sudo vi /etc/logstash/conf.d/logstash.conf

This will open an empty file. Press “i” to get into insert mode and copy in the following:

input {
       file {
           path => "/home/<YOUR_USER_NAME>/apache_logs"
           start_position => "beginning"
           ignore_older => 0
          }
}
filter {
         grok {
                 match =>  { "message" => "%{COMBINEDAPACHELOG}" }
                }
          date{
                 match => [ "timestamp" , "dd/MMM/yyyy:HH:mm:ss Z" ]
                }
}
output {
          elasticsearch {
                     hosts => [ "localhost:9200" ]
                   }
          stdout {
                  codec => rubydebug
                }
}

Please replace the placeholder above with the relevant value for you. Once done, press ESC, then type :wq! and press Enter. This will save the data into the file and exit the editor.

A Logstash pipeline accepts inputs and filters of our choice and outputs to the destination/destinations of our choice. This is exactly what we are defining in the config file above. In the input section, we specified the file we want Logstash to monitor and then explicitly mentioned that we want it to start reading from the beginning. Otherwise, Logstash only checks for updates (i.e. tails the file), and by setting ignore_older => 0 we are telling Logstash to consider old files too.

In the output section, we are specifying that we want the output redirected to Elasticsearch, which is running on port 9200 on our local machine. Apart from this, we also want it to display the output on standard output, i.e. our screen. codec specifies the Logstash codec to use; codecs provide a way to encode and decode input and output data.

Run Logstash with the config file like so

$/usr/share/logstash/bin/logstash  -f  /etc/logstash/conf.d/logstash.conf

(note – Logstash dumps a lot of messages on the screen and may take some time to start up)

A “Successfully started pipeline” message implies Logstash started correctly and is listening for any changes to the log file. Any new updates to the log file will be displayed here.
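
To double check that documents are actually landing in Elasticsearch while the pipeline runs, a small sketch using Python's requests library is shown below (host and port are assumptions based on our config file):

import requests

# Total number of documents shipped into the logstash-* indices so far
resp = requests.get("http://localhost:9200/logstash-*/_count")
print(resp.json())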

For an exhaustive list of options for the config file, please check the official Elasticsearch documentation – https://www.elastic.co/guide/en/logstash/current/configuration-file-structure.html

Examples of config files can be referenced at – https://www.elastic.co/guide/en/logstash/current/config-examples.html

Troubleshooting Logstash installation:

It’s common to run into installation issues while setting up Logstash. We have described some common cases here.

Unrecognized VM option ‘UseParNewGC’

This indicates that the Java version on the system is not compatible with Logstash. At the time of writing this post, Logstash requires Java 8 and is not compatible with higher versions. So, let’s first check the version present on our system

$java -version

To check all the java versions available on the system, you can type

$update-java-alternatives --list

This will display the available java versions. From the screenshot below, we can see that java version 8 is already present on the system.

So, we can either make version 8 the default or export the JAVA_HOME variable. We will export JAVA_HOME and then run Logstash.

$echo $JAVA_HOME

$export JAVA_HOME="/usr/lib/jvm/java-8-oracle"

echo $JAVA_HOME will display the Java home path if it’s set; a blank output implies the JAVA_HOME variable is not set. The update-java-alternatives listing above gives the complete path to the Java installation, and we can use it in the export command to set the new JAVA_HOME variable.

Once the export runs successfully, you can use the “echo” command as shown above to check that JAVA_HOME is set to version 8. You can then start Logstash and it should run successfully.

In case Java version 8 is not available on your system, you can install it either from Oracle or from OpenJDK.

$sudo apt-get install oracle-java8-installer

Installation will dump a lot of messages on the screen as shown below

You can now export this path into JAVA_HOME, run Logstash again, and it should be good to go.

With this we are moving closer to our final post in the ELK series, wherein we will talk about a case study outlining a real life use case involving ELK stack. Stay tuned!

Introduction to Visualizations using Kibana with Elasticsearch

We talked about Kibana installation on Ubuntu in the previous blog post; let’s look at how to set up some basic visualizations using Kibana with Elasticsearch.

Log into Kibana using http://127.0.0.1:5601. Once the Kibana page opens up, from the left side Menu, click on “Management” and then choose Index Patterns -> Create Index Pattern and fill in the data as shown below (once you start typing, it should display “bank” as an option). Go ahead and click Next on step 2 and the index pattern will be created, with all the fields displayed.

All the fields that are part of the accounts.json file will be displayed here (the file can be downloaded as per steps in the first blog post of this Elasticsearch series)

Let’s get a basic understanding of what Kibana offers. From left sidebar Menu, choose the first option “Discover”. Kibana will load data for index “bank”. If you already have more than one index, you can choose from the dropdown as shown below

You can use the filters on top of the screen to fetch the data of your choice

You can type them in the field provided above as follows (we are querying for all those accounts where the balance is greater than 40000)
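
For reference, the same filter expressed as an Elasticsearch range query from Python might look like the sketch below (host, port and index name are assumptions based on our earlier setup):

from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# All accounts whose balance is greater than 40000
body = {"query": {"range": {"balance": {"gt": 40000}}}}
result = es.search(index="bank", doc_type="account", body=body)
print(result["hits"]["total"])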

Now, let’s see how many accounts are present for different age groups. So, select the age field and click on Visualize

You will get an age wise descending graph depicting the number of accounts present for a certain age with an account balance greater than 40,000

We can also use “Visualize” option from the left side Menu to create useful visualizations for the data. Let’s create a pie chart.

Click on Visualize->Pie

Choose Split Slices and then Aggregation=Range, Field=balance; the From and To fields you can fill in as below, or however you wish your data to be divided. The balance ranges are all shown in different colors, depicting the number of accounts in each balance range.

Let’s try another visualization. Click on Visualize -> Area (in basic charts), choose the index “bank” and then under “buckets” choose “Split Chart” -> Aggregation=Range, Field=balance, and enter the different ranges for balance. You get to see something like this

Hmm, not very useful. Let’s change it a bit. Choose “X-Axis” under “buckets” and then repeat the same settings for aggregation, field and balance ranges.

Well, that’s definitely more readable. It’s obvious that the number of accounts is higher for the 10000-20000 range than for 0-10000. Then there’s a very slight increase for the 20000-30000 range, and it reduces for 30000-40000. Kibana offers a rich suite of visualizations; we just need to learn how to select the right option to make sense of the data that we have.

We can read more about the visualizations, fields, parameters and advanced options offered by Kibana on the official documentation page.

Now that we are familiar with the Kibana interface, let’s do some basic search operations using Elasticsearch and replicate the same using Kibana. To do this, we shall use the accounts.json file again. Let’s use that data and gather some insights.

So, let’s say the bank wishes to know about the number of customers it has in each age group. We need to group together data based on customers’ age. This type of clustering of data is called aggregation and is one of the most powerful ways for data analysis.

On the command line, you can invoke this aggregation using curl (on the server running Elasticsearch)

curl -XGET -H 'Content-Type: application/json' 'localhost:9200/bank/account/_search?size=0&pretty' -d '
{
  "aggs": {
           "age": {
                   "terms": {
                             "field": "age"
                             }
                  }
          }
}'

Description of the query:

_search in the curl request says our request is about searching the data. size=0 specifies that we do not want the full set of matched documents printed to our screen; this makes sure that only the result of our aggregation is displayed. “pretty” indicates that the output of the query should be in a readable format; otherwise we get a log-file-like output which is not very user friendly.

“aggs” says it’s an aggregation that we are requesting, “age” is the name we give our custom aggregation, “terms” indicates a terms aggregation (grouping by distinct values), and “field” names the field we group on.

and the output will look like

The output shows that the data has been grouped by age along with the number of customers who fall in each age group. (“key” refers to the age of the customer and “doc_count” to the total number of customers who are of the mentioned age).

Let’s visualize the same output on Kibana (you can cross check with the data on the screen above)

To sort by the age of the customer, choose Alphabetical as shown in the screen below:

Let’s say we want to know the number of customers with bank balance greater than 10000, for different age groups. This requires us to filter our search for balance greater than 10,000 and then aggregate them by the age.

If you wish to see all the documents that are being considered for aggregation, you can run the above command without the “size=0” option. It will display all the documents being considered for aggregation along with the aggregation results.

The output will look like so (Each document whose search criteria is met, will be displayed):

Querying for the same data using Python

import requests
res = requests.get('http://localhost:9200')
print(res.content)

from elasticsearch import Elasticsearch
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# query the single account document whose account_number is 99
body = {
    "query": {
        "term": {
            "account_number": 99
        }
    }
}
output = es.search(index='bank', doc_type='account', body=body)
for doc in output['hits']['hits']:
    print("%s) %s" % (doc['_id'], doc['_source']))
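
Building on the snippet above, the balance-filtered age aggregation discussed earlier could be expressed with the same client like so (a sketch; the field names come from accounts.json):

from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# Accounts with balance > 10000, grouped by age; size=0 keeps only the aggregation
body = {
    "size": 0,
    "query": {"range": {"balance": {"gt": 10000}}},
    "aggs": {"age": {"terms": {"field": "age"}}}
}
output = es.search(index='bank', doc_type='account', body=body)
for bucket in output["aggregations"]["age"]["buckets"]:
    print(bucket["key"], bucket["doc_count"])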

I would highly recommend that you practice these visualizations (and some more) on your local dev machine so that you can get a much deeper understanding of these concepts.

Kibana Installation and Troubleshooting Tips

In the previous blog post, we gave a bird’s eye introduction to Elasticsearch, which is a highly scalable open-source full-text search and analytics engine.  But, everyone does not get a kick out of typing commands for querying data using Elasticsearch. Is there another way around?

What is Kibana? Why is it required when I already have Elasticsearch?

Well, as they say – a picture is worth a thousand words. We have an open source analytics and visualization platform called Kibana that plays nice with Elasticsearch. Additionally, setting up Kibana is easy and you can access the underlying data without having to write a single line of code. Also, Kibana uses the web browser as its interface, so you do not need to install any additional tools to access it.

Let’s get started with Kibana and you will see how easy it is to set it up.

Installing Kibana and troubleshooting

Command for installing Kibana on the same server as Elasticsearch:

$sudo apt-get install kibana

Once Kibana installs, we change the host entry (as we did for Elasticsearch) to open it up for access from anywhere. This should not be done on production systems. Type the following:

$sudo vi /etc/kibana/kibana.yml

Use arrow keys to move down the file. We need to change the following line to open access from anywhere (again, this configuration should not be used on production systems).

server.host: “localhost” needs to be changed to server.host: “0.0.0.0”

Now, we will add kibana to the daemon services, so it starts automatically on system startup

$sudo /bin/systemctl daemon-reload

$sudo /bin/systemctl enable kibana.service

$sudo /bin/systemctl start kibana.service

We have now started the service; we can access Kibana from the web browser by typing 127.0.0.1:5601 (Kibana runs on port 5601). If everything goes fine, you will see the Kibana home page

Troubleshooting:

What if the Kibana home page doesn’t open up as shown above, and you instead get a page with a “cannot be displayed” error?

In that case, first we need to check if kibana is up and running. To do this, type the following command

$netstat -a -n | grep tcp | grep 5601

You will see something like so (Kibana is up and running on port 5601; LISTEN implies it is listening for incoming requests and ready to process them)

What if nothing comes up? It means the Kibana service has not started for some reason. One way to check the underlying cause:

$journalctl -u kibana.service

The journalctl Linux command is used to query the contents of the journal written by the Linux service manager. Earlier, we started Kibana as a service (using the command ‘systemctl start kibana.service’) and hence its logging data can be retrieved using journalctl.

You will see a log similar to the one below. As you can see, Kibana has logged its activity here and we can read through the log to figure out what went wrong with the service we started. In this case, Kibana did not start since the device seems to be running out of space.

The above command might output a lot of information, much of it not even from recent attempted runs of Kibana. To check what has been logged for the most recent runs (today’s entries), the following command can be used

$journalctl -u kibana.service --since today

To display a set number of lines (n), you can use the syntax below (here we are displaying the last 20 lines)

$journalctl -n 20

Coming to the issue – one way to deal with space scarcity is to check the apt package archives, which fill up quite fast on the server if left unchecked. Type the following at the prompt

$sudo du -sh /var/cache/apt/archives

Output will be something similar to what is shown above (496M is the space occupied). I have cleared my archives with the command below and, as you can see, the updated occupied space is only 16K now.

If the space occupied is quite high, you can clear them using

$sudo apt-get clean

Sometimes the logs (that we check through the journalctl command) also take up a lot of space. Run the following command to check the space used

$journalctl --disk-usage

If you wish to clear them, the following command can be used (it clears all logs except for the last 1 day; you can give 1year if you wish to retain the last year of logs, and so on, based on your requirement)

$sudo journalctl --vacuum-time=1day

Now rerun the disk-usage command above to check the updated space occupied by the journal logs.

You can try restarting kibana as shown below and see if it works:

$sudo /bin/systemctl restart kibana

To check status of kibana you can use:

$sudo /bin/systemctl status kibana.service

As we are using VirtualBox to run Ubuntu and Kibana didn’t start even after clearing the archives, we had to allocate more disk space. I have outlined the steps below.

Before making any changes to the disk size, please log out of Ubuntu and then exit VirtualBox. We need the exact location of the virtual disk (VDI) file.

Go to VirtualBox -> ubuntu -> Settings -> Storage -> select the VDI from the left panel as shown below.

Copy everything shown in the Location field and exit VirtualBox.

Now, open terminal and type the following

(The actual syntax of the command is:

sudo VBoxManage modifyhd "location_where_disk_is" --resize <new size in MB>)

$sudo VBoxManage modifyhd /Users/<my_name>/VirtualBox\ VMs/ubuntu-elastic/ubuntu-elastic.vdi --resize 25600

This will increase the disk to the new size. Once this completes successfully, repeat the steps above for restarting the services and check that they start successfully.

That’s it folks, look forward to your comments or questions on Kibana.

Beginners Guide to Elasticsearch

What is Elasticsearch and why should I know about it?

Elasticsearch is a highly scalable open-source full-text search and analytics engine. It allows you to store, search and analyze big volumes of data quickly and in near real time (Numbers, text, geo, structured, unstructured. All data types are welcome). It is generally used as the underlying engine/technology that powers applications that have complex search features and requirements.

OK, so what can I do with it?

Well, you can develop a technology product using Elasticsearch that can help in solving the following use cases for –

  • A Blogger – Are a lot of people talking about me on Facebook/Twitter/Linkedin? (Number of people who tagged/shared/mentioned me)
  • A Budding Star – How popular am I? Do only my fans know me, or the rest of the world as well?
  • A Musician – I would like to hold a concert outside my country, how do I know where I’m popular?
  • A Sales Manager – Are people talking about our product? Which social media platform is good for us ? Twitter? Facebook? Linkedin? (Breaking up and analyzing the data geography-wise in order to give a bird’s eye view of the scenario)
  • A B2C manager – Once in a while, we end up with a customer who is unable to complete his order. We are forced to manually step in and complete his process. Is there a better way? (Analyzing and making sense of thousands of log files and pinpoint the cause of issue)
It is useful to look at some of the common terms in the world of Elasticsearch, and understand their definitions. If you would like, please look at the source here:
(https://www.elastic.co/guide/en/elasticsearch/reference/current/_basic_concepts.html)

Cluster

A cluster is a collection of one or more nodes (servers) that holds together your entire data and provides federated indexing and search capabilities across all nodes. A cluster is identified by a unique name which by default is “elasticsearch” for an elasticsearch cluster. Grouping together of nodes (or servers or individual computers) helps in faster querying and load balancing.

Node

A node is a single server that is part of your cluster, stores your data, and participates in the cluster’s indexing and search capabilities. Just like a cluster, a node is identified by a name which by default is a random Universally Unique Identifier (UUID) that is assigned to the node at startup.
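
Once Elasticsearch is installed (we cover the installation later in this post), a quick way to see the cluster and its nodes is to hit the _cluster/health and _cat/nodes endpoints. A small sketch in Python follows (localhost:9200 is an assumption for a local single-node install):

import requests

# Cluster-level view: cluster name, status and number of nodes
print(requests.get("http://localhost:9200/_cluster/health?pretty").text)

# Node-level view: one line per node in the cluster
print(requests.get("http://localhost:9200/_cat/nodes?v").text)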

Index

An index is a collection of documents that have somewhat similar characteristics. For example, you can have an index for customer data, another index for a product catalog, and yet another index for order data.

If you already have an idea of structured Databases, you can equate an Index to a Database.

Type

A type used to be a logical category/partition of your index that allowed you to store different types of documents in the same index, e.g. one type for users, another type for blog posts. It is no longer possible to create multiple types in an index, and the whole concept of types is being removed from Elasticsearch, starting with version 6.

A type stands equal to a table of a database.

Why are mapping types being removed?

(taken from official elasticsearch documentation https://www.elastic.co/guide/en/elasticsearch/reference/current/removal-of-types.html)

Initially, we spoke about an “index” being similar to a “database” in an SQL database, and a “type” being equivalent to a “table”.

This was a bad analogy that led to incorrect assumptions. In an SQL database, tables are independent of each other. The columns in one table have no bearing on columns with the same name in another table. This is not the case for fields in a mapping type.

In an Elasticsearch index, fields that have the same name in different mapping types are backed by the same Lucene field internally. In other words, using the example above, the user_name field in the user type is stored in exactly the same field as the user_name field in the tweet type, and both user_name fields must have the same mapping (definition) in both types.

On top of that, storing different entities that have few or no fields in common in the same index leads to sparse data and interferes with Lucene’s ability to compress documents efficiently.

For these reasons, the concept of mapping types from Elasticsearch has been removed.

Document

A document is a basic unit of information that can be indexed. For example, you can have a document for a single customer, another document for a single product, and yet another for a single order. This document is expressed in JSON (JavaScript Object Notation) which is a ubiquitous internet data interchange format.

A document stands equal to a row of a table.

Within an index/type, you can store as many documents as you want. Note that although a document physically resides in an index, a document actually must be indexed/assigned to a type inside an index.

Shards & Replicas

An index can potentially store a large amount of data that exceeds the hardware limits of a single node, resulting in a node that may be too slow to serve search requests on its own.

To solve this problem, Elasticsearch provides the ability to subdivide your index into multiple pieces called shards. When you create an index, you can simply define the number of shards that you want. Each shard is in itself a fully-functional and independent “index” that can be hosted on any node in the cluster.

The mechanics of how a shard is distributed, and also how its documents are aggregated back into search requests, are completely managed by Elasticsearch and are transparent to you as the user.

In a network/cloud environment where failures can be expected anytime, it is very useful and highly recommended to have a failover mechanism in case a shard/node somehow goes offline or disappears for whatever reason. To this end, Elasticsearch allows you to make one or more copies of your index’s shards into what are called replica shards, or replicas for short.
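
The number of shards and replicas is set when an index is created. As a small illustration (a sketch using the Python client introduced later in this post; the index name demo_index is just an example):

from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# Create a hypothetical index with 3 primary shards, each with 1 replica copy
es.indices.create(
    index='demo_index',
    body={'settings': {'number_of_shards': 3, 'number_of_replicas': 1}}
)
print(es.indices.get_settings(index='demo_index'))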

How are Lucene and Elasticsearch connected?

Lucene or Apache Lucene is a high-performance, full-featured text search engine library written entirely in Java. It is a technology suitable for nearly any application that requires full-text search, especially cross-platform. Elasticsearch is built over Lucene.

For now, this glossary of terms is good enough to get up and running with a basic setup of Elasticsearch. 

Installing Elasticsearch on ubuntu

The official Debian package installation instructions can be followed at

https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html

This page talks in detail about all the steps, configurations and issues.

If you wish to fast forward to a quick installation, you can follow the steps below (to install Elasticsearch on Ubuntu):

  • Download and install the public signing key:
wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add -
  • Install apt-transport-https so that apt can fetch the packages from the Elastic repository over HTTPS.
sudo apt-get install apt-transport-https
  • Save the repository definition to /etc/apt/sources.list.d/elastic-6.x.list
echo "deb https://artifacts.elastic.co/packages/6.x/apt stable main" | sudo tee -a /etc/apt/sources.list.d/elastic-6.x.list
  • Install Elasticsearch package
 sudo apt-get update && sudo apt-get install elasticsearch
  • To be able to access Elasticsearch from anywhere, make the following changes (not for production systems)
sudo vi /etc/elasticsearch/elasticsearch.yml

(this opens up the file in an editor to help us make changes)

Press the arrow keys to move down in the file and locate the following line

network.host: 0.0.0.0 (this line is generally commented out; uncomment it and update the host to all zeros as shown)

Press ESC, then type :wq and press Enter. This will save the changes to the file and exit the editor.

  • To configure Elasticsearch to start automatically when the system boots up, run the following commands:
sudo /bin/systemctl daemon-reload

sudo /bin/systemctl enable elasticsearch.service
  • To start Elasticsearch
sudo systemctl start elasticsearch.service
  • Let’s test our Elasticsearch by typing
curl 127.0.0.1:9200

     You will see something along these lines.

Sample data sets are available on

https://www.elastic.co/guide/en/kibana/current/tutorial-load-dataset.html

Download the accounts json zip file (ready to use as it doesn’t require mapping) on the server using the command

wget https://download.elastic.co/demos/kibana/gettingstarted/accounts.zip

Unzip using

unzip accounts.zip

If Unzip is not available, download Unzip software using the command

sudo apt install unzip

(you might be asked for the sudo password; enter it and continue)

Once Unzip is successfully installed, you can run the above command and unzip the accounts.zip file. On successful completion, this creates the accounts.json file. We will use this file as our input, to enter data for our search queries.

Run the following command to bulk insert the JSON data extracted above

curl -H 'Content-Type: application/x-ndjson' -XPOST 'localhost:9200/bank/account/_bulk?pretty' --data-binary @accounts.json

(here, our index is ‘bank’ and our type is ‘account’)

Accessing our Elasticsearch from Python Client:

To install the Elasticsearch package for Python, run the following command from the terminal:

$pip install elasticsearch

Let’s now look at a very simple python program to connect to our Elasticsearch engine and query for data.

from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

es.get(index='bank', doc_type='account', id=990)

Output on the console will look like so (the data is retrieved from the accounts.json file that we loaded into our Elasticsearch cluster in the previous step).
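
A couple more things you can try with the same client, sketched below (assuming the bank data loaded earlier): count the documents in the index and run a simple match_all search.

from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# How many account documents does the bank index hold?
print(es.count(index='bank'))

# Fetch a couple of documents with a match_all query
res = es.search(index='bank', doc_type='account',
                body={'query': {'match_all': {}}, 'size': 2})
for hit in res['hits']['hits']:
    print(hit['_id'], hit['_source'])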

Linear regression using Apache Spark MLlib

What is linear Regression?

Wikipedia states – in statistics, linear regression is a linear approach to modeling the relationship between a dependent variable and one or more independent variables.

Linear regression is a basic and commonly used type of predictive analysis.

Back to school math: every straight line can be represented by the equation y = mx + b, where y is the dependent variable and x is the independent variable on which y depends.

How can we use regression for a real life use case? Let’s take an example – say I have data from the past 5-10 years on the quantity of wheat produced annually. With linear regression, I will be able to predict what the wheat production would be this year, or a year from now.

Why is prediction so important? It helps us plan and be prepared, and deal with unforeseen situations. In the context of the above example, a country will know the quantity of wheat it can export/import in a year. If global demand seems much lower than the quantity we foresee producing, we can help farmers choose some other crop, since low demand means a bare minimum selling price.

Some additional use cases:

  1. A consistent increase in blood pressure and sugar levels: is the patient heading towards a heart attack?
  2. Distinctive seismic activity: are we in for a tsunami/earthquake?
  3. Inward migration of birds increasing on a yearly basis: is a certain species of tree responsible for attracting the birds?
  4. Will a certain steel stock move up or down this year?

These are all basic use cases of the linear regression model. We call it regression because we will be predicting continuous values (as opposed to a yes or no result). Graphically, a linear equation (with one dependent and one independent variable) looks similar to this

So, for every x, we can derive the value of y from the equation.

What happens if y depends not just on one independent variable (x) but on a few more? The graph above will then have not just x and y axes but a z axis too, to represent the second independent variable. More than two independent variables are hard to depict graphically, but they can be represented quite easily with an equation:

y = β0 + β1x1 + β2x2 + … + βnxn

where β1, β2, …, βn are the coefficients of x1, x2, …, xn

and β0 is the y-intercept

What this implies for our wheat production example is:

y (total wheat produced in a year) = β0 + β1 * total available land in acres + β2 * rainfall received for that year + β3 * fertiliser availability in the market + … Here, rainfall received and fertiliser availability are the added independent variables, apart from the size of the land in acres.

Where does Machine Learning come into picture?

In the above equation, β1, β2, β3, … are all coefficients whose values we need to compute to form the linear equation. This is where the learning happens. Based on past data, ML learns the values of these coefficients through a number of iterations.

How?

To start with, we feed data from the past few years into the ML packages; this is called training data, since it helps train the ML model. From this large chunk of data, over a number of iterations, the model learns the value of each coefficient.

We can then evaluate the model; ML packages offer a lot of functions and related metrics to see how the model has performed. Based on these values we decide whether the model has learnt “enough”, and then use it on the data we need predictions for (test data).

What is the error people talk about in ML?

If all data points (y1, y2, y3, …) fall on a perfect straight line as shown above, we can derive the exact output (prediction) we are looking for. But in the real world this is not the case: the data points do not exactly form a line, they are a bit scattered on the graph. So what do we do? We draw a line in such a way that it is at the least possible distance from the points, as shown below:

This is where root mean square error (RMSE) or another error measure is used. We use the chosen method to come out with a line (which best fits the given data points) as output. Based on this line we predict the outcome; this is where our final values come from.
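
As a toy illustration of what “least possible distance” means, here is a small sketch computing the RMSE for a handful of observed/predicted pairs (the predicted numbers are made up, just to show the arithmetic):

from math import sqrt

# Observed y values vs. values predicted by some fitted line (made-up predictions)
observed  = [128, 66, 120, 141, 267]
predicted = [120, 70, 118, 150, 250]

residuals = [o - p for o, p in zip(observed, predicted)]
rmse = sqrt(sum(r * r for r in residuals) / len(residuals))
print(rmse)   # roughly 9.5 for these numbers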

Let’s look at a very basic example of linear regression in PySpark

I have downloaded the dataset from  https://www.kaggle.com/leonbora/analytics-vidhya-loan-prediction

  1. Creating a Spark session
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('lr_example').getOrCreate()

2.  Importing Linear Regression packages

from pyspark.ml.regression import LinearRegression

3.  We will read the input data and see its structure

data = spark.read.csv("train.csv",inferSchema=True,header=True)
data.printSchema()
#output looks similar to this
root
 |-- Loan_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Married: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Self_Employed: string (nullable = true)
 |-- ApplicantIncome: integer (nullable = true)
 |-- CoapplicantIncome: double (nullable = true)
 |-- LoanAmount: integer (nullable = true)
 |-- Loan_Amount_Term: integer (nullable = true)
 |-- Credit_History: integer (nullable = true)
 |-- Property_Area: string (nullable = true)
 |-- Loan_Status: string (nullable = true)

Machine Learning packages expect only numerical inputs and cannot accept strings. A lot of utilities exist to help us transform our data to suit ML requirements; we will look at this aspect in another post. For now, we will just consider two numerical fields from the above schema (‘ApplicantIncome’ and ‘CoapplicantIncome’) and try to predict ‘LoanAmount’ from these inputs.

4. All our input features need to be combined into a single vector column, which is the input format the ML packages expect. So, let’s get this sorted first:

from pyspark.ml.feature import (VectorAssembler, VectorIndexer)
#lets define what inputs will go into our vector and give a name for the output of it
lassembler = VectorAssembler(
    inputCols=['ApplicantIncome','CoapplicantIncome'],
    outputCol='features')

5. We will now transform our data to ML standard input

output = lassembler.transform(data)

If you have downloaded the same set, running the above command will output something similar to:

The error is pretty clear: while transforming the data, the assembler has encountered nulls and is asking what should be done with them. We have two options: clean the data of null values and feed it back, or tell the assembler to skip the null values. We will take the second route by modifying the line from step 5:

output = lassembler.setHandleInvalid("skip").transform(data)

Now, the code executes.  Let’s see what’s there in the “features” output

output.select("features").show()
#output looks similar to below
+-----------------+
|         features|
+-----------------+
|  [4583.0,1508.0]|
|     [3000.0,0.0]|
|  [2583.0,2358.0]|
|     [6000.0,0.0]|
|  [5417.0,4196.0]|
|  [2333.0,1516.0]|
|  [3036.0,2504.0]|
|  [4006.0,1526.0]|
|[12841.0,10968.0]|
|   [3200.0,700.0]|
|  [2500.0,1840.0]|
|  [3073.0,8106.0]|
|  [1853.0,2840.0]|
|  [1299.0,1086.0]|
|     [4950.0,0.0]|
|     [3596.0,0.0]|
|     [3510.0,0.0]|
|     [4887.0,0.0]|
|  [2600.0,3500.0]|
|     [7660.0,0.0]|
+-----------------+
only showing top 20 rows

6. Let’s now pair this features vector with the value we want to predict (y), which is ‘LoanAmount’

loan_data = output.select('features','LoanAmount')
loan_data.show()
#and Output is
+-----------------+----------+
|         features|LoanAmount|
+-----------------+----------+
|  [4583.0,1508.0]|       128|
|     [3000.0,0.0]|        66|
|  [2583.0,2358.0]|       120|
|     [6000.0,0.0]|       141|
|  [5417.0,4196.0]|       267|
|  [2333.0,1516.0]|        95|
|  [3036.0,2504.0]|       158|
|  [4006.0,1526.0]|       168|
|[12841.0,10968.0]|       349|
|   [3200.0,700.0]|        70|
|  [2500.0,1840.0]|       109|
|  [3073.0,8106.0]|       200|
|  [1853.0,2840.0]|       114|
|  [1299.0,1086.0]|        17|
|     [4950.0,0.0]|       125|
|     [3596.0,0.0]|       100|
|     [3510.0,0.0]|        76|
|     [4887.0,0.0]|       133|
|  [2600.0,3500.0]|       115|
|     [7660.0,0.0]|       104|
+-----------------+----------+
only showing top 20 rows

7. Standard practice is to divide the data into 2 parts: train the ML model with the first and let it predict the values on the second set, so we can cross-check the performance.

#splitting train and test data into 70% and 30% of total data available
train_data,test_data = loan_data.randomSplit([0.7,0.3])

#finally, calling the linear regression package we imported
lr = LinearRegression(labelCol='LoanAmount')
#we are specifying our 'y' by explicitly mentioning it with 'labelCol'

#fitting the model to train data set
lrModel = lr.fit(train_data)

When you run the above, you should get an output like below

The error says – “Params must be either a param map…”. The reason for this error is that our dependent variable (LoanAmount) has null values, and ML cannot fit a model which has nulls as output values. There are a lot of ways to clean this kind of data; we will simply not consider null values in our example. Let’s filter out null ‘LoanAmount’ values when we read the data from the csv itself, like so:

data = spark.read.csv("train.csv",inferSchema=True,header=True)
#we will add a filter to remove the null values from our dependent variable
data = data.filter("LoanAmount is not NULL")
data.printSchema()

Repeat the steps above and the error will go away. So, our Linear regression Model is ready now.
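
Before moving on, it can be handy to peek at what the model has actually learnt. The fitted coefficients and intercept are exposed on the model object (a quick sketch, continuing with the lrModel fitted above):

#one coefficient per input feature (ApplicantIncome, CoapplicantIncome) plus the intercept
print("Coefficients: {}".format(lrModel.coefficients))
print("Intercept: {}".format(lrModel.intercept))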

8. Let’s check the residuals (residual = observed value (the value in our input) – the value predicted by the model). Each data point has one residual, so the number of residuals will be equal to the number of records we fed in as input.

test_results = lrModel.evaluate(test_data)
test_results.residuals.show()
#output looks similar to below
+--------------------+
|           residuals|
+--------------------+
|5.684341886080801...|
|-1.42108547152020...|
|-4.26325641456060...|
|-1.42108547152020...|
|-1.42108547152020...|
|-2.84217094304040...|
|-1.42108547152020...|
|-1.42108547152020...|
|-1.42108547152020...|
|-1.42108547152020...|
|                 0.0|
|-1.42108547152020...|
|-2.13162820728030...|
|-1.42108547152020...|
|-2.84217094304040...|
|                 0.0|
|2.842170943040400...|
|-1.42108547152020...|
|-1.42108547152020...|
|-3.55271367880050...|
+--------------------+
only showing top 20 rows
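
The same evaluation object also exposes summary metrics, which give a quicker read on model quality than eyeballing residuals (a short sketch, continuing with test_results from above):

#aggregate error metrics for the test split
print("RMSE: {}".format(test_results.rootMeanSquaredError))
print("MAE: {}".format(test_results.meanAbsoluteError))
print("r2: {}".format(test_results.r2))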

9. Voila, we can now use this model to predict the output for our test data

predictions_data = test_data.select('features')
predictions = lrModel.transform(predictions_data)
predictions.show()
#output looks similar to below
+----------------+------------------+
|        features|        prediction|
+----------------+------------------+
|  [150.0,1800.0]| 102.2878796393209|
| [1299.0,1086.0]|104.99451369589991|
| [1378.0,1881.0]|113.43100472341747|
| [1500.0,1800.0]|113.66768427384854|
|[1600.0,20000.0]|292.40273753359924|
| [1782.0,2232.0]|120.26729293510716|
| [1800.0,1213.0]|110.45902065483665|
| [1820.0,1719.0]|115.57340183734365|
| [1820.0,1769.0]|116.06211641088294|
|[1836.0,33837.0]| 429.6389670546782|
|    [1880.0,0.0]| 99.27716389393049|
| [1907.0,2365.0]|122.62095931502981|
| [1926.0,1851.0]|117.75713371242067|
| [2031.0,1632.0]|116.50165979633736|
| [2130.0,6666.0]|166.53996206680586|
| [2132.0,1591.0]|116.95229182239609|
| [2137.0,8980.0]| 189.2166789246058|
|    [2221.0,0.0]|102.15161824976303|
|    [2237.0,0.0]|102.28649000839447|
| [2253.0,2033.0]|122.29249632713373|
+----------------+------------------+
only showing top 20 rows

That’s it folks.