Loading JSON
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext()
One of the examples in the repository accompanying the Learning Spark book I’m working through is a JSON payload of a tweet by the author.
It’s pretty data-rich, even though this is just a single result from whatever API generated the example.
open('../data/testweet.json').read()
'{"createdAt":"Nov 4, 2014 4:56:59 PM","id":529799371026485248,"text":"Adventures With Coffee, Code, and Writing.","source":"\\u003ca href\\u003d\\"http://twitter.com\\" rel\\u003d\\"nofollow\\"\\u003eTwitter Web Client\\u003c/a\\u003e","isTruncated":false,"inReplyToStatusId":-1,"inReplyToUserId":-1,"isFavorited":false,"retweetCount":0,"isPossiblySensitive":false,"contributorsIDs":[],"userMentionEntities":[],"urlEntities":[],"hashtagEntities":[],"mediaEntities":[],"currentUserRetweetId":-1,"user":{"id":15594928,"name":"Holden Karau","screenName":"holdenkarau","location":"","description":"","descriptionURLEntities":[],"isContributorsEnabled":false,"profileImageUrl":"http://pbs.twimg.com/profile_images/3005696115/2036374bbadbed85249cdd50aac6e170_normal.jpeg","profileImageUrlHttps":"https://pbs.twimg.com/profile_images/3005696115/2036374bbadbed85249cdd50aac6e170_normal.jpeg","isProtected":false,"followersCount":1231,"profileBackgroundColor":"C0DEED","profileTextColor":"333333","profileLinkColor":"0084B4","profileSidebarFillColor":"DDEEF6","profileSidebarBorderColor":"FFFFFF","profileUseBackgroundImage":true,"showAllInlineMedia":false,"friendsCount":600,"createdAt":"Aug 5, 2011 9:42:44 AM","favouritesCount":1095,"utcOffset":-3,"profileBackgroundImageUrl":"","profileBackgroundImageUrlHttps":"","profileBannerImageUrl":"","profileBackgroundTiled":true,"lang":"en","statusesCount":6234,"isGeoEnabled":true,"isVerified":false,"translator":false,"listedCount":0,"isFollowRequestSent":false}}\n'
In PySpark
If we wanted to work with this data in PySpark, we’d first have to set up a SparkSession
object.
spark = pyspark.sql.SparkSession(sc)
And use its read.json
method.
data = spark.read.json('../data/testweet.json')
We can untangle some of the complexity of this object using the printSchema
method.
data.printSchema()
root
|-- contributorsIDs: array (nullable = true)
| |-- element: string (containsNull = true)
|-- createdAt: string (nullable = true)
|-- currentUserRetweetId: long (nullable = true)
|-- hashtagEntities: array (nullable = true)
| |-- element: string (containsNull = true)
|-- id: long (nullable = true)
|-- inReplyToStatusId: long (nullable = true)
|-- inReplyToUserId: long (nullable = true)
|-- isFavorited: boolean (nullable = true)
|-- isPossiblySensitive: boolean (nullable = true)
|-- isTruncated: boolean (nullable = true)
|-- mediaEntities: array (nullable = true)
| |-- element: string (containsNull = true)
|-- retweetCount: long (nullable = true)
|-- source: string (nullable = true)
|-- text: string (nullable = true)
|-- urlEntities: array (nullable = true)
| |-- element: string (containsNull = true)
|-- user: struct (nullable = true)
| |-- createdAt: string (nullable = true)
| |-- description: string (nullable = true)
| |-- descriptionURLEntities: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- favouritesCount: long (nullable = true)
| |-- followersCount: long (nullable = true)
| |-- friendsCount: long (nullable = true)
| |-- id: long (nullable = true)
| |-- isContributorsEnabled: boolean (nullable = true)
| |-- isFollowRequestSent: boolean (nullable = true)
| |-- isGeoEnabled: boolean (nullable = true)
| |-- isProtected: boolean (nullable = true)
| |-- isVerified: boolean (nullable = true)
| |-- lang: string (nullable = true)
| |-- listedCount: long (nullable = true)
| |-- location: string (nullable = true)
| |-- name: string (nullable = true)
| |-- profileBackgroundColor: string (nullable = true)
| |-- profileBackgroundImageUrl: string (nullable = true)
| |-- profileBackgroundImageUrlHttps: string (nullable = true)
| |-- profileBackgroundTiled: boolean (nullable = true)
| |-- profileBannerImageUrl: string (nullable = true)
| |-- profileImageUrl: string (nullable = true)
| |-- profileImageUrlHttps: string (nullable = true)
| |-- profileLinkColor: string (nullable = true)
| |-- profileSidebarBorderColor: string (nullable = true)
| |-- profileSidebarFillColor: string (nullable = true)
| |-- profileTextColor: string (nullable = true)
| |-- profileUseBackgroundImage: boolean (nullable = true)
| |-- screenName: string (nullable = true)
| |-- showAllInlineMedia: boolean (nullable = true)
| |-- statusesCount: long (nullable = true)
| |-- translator: boolean (nullable = true)
| |-- utcOffset: long (nullable = true)
|-- userMentionEntities: array (nullable = true)
| |-- element: string (containsNull = true)