Archives

Categories

If this helped you, please share!

Ijson coroutines and generators

Published February 27, 2020 in data - 0 Comments

Reader comments on an old post about the ijson parser prompted me to check out the project’s more recent releases. The latest pre-release (v3.0rc1) added a coroutine interface, which allows users to supply their own file readers and have more control over when the parser is called. It looked like a fun feature to explore, and here is a bit of code to try it out.

Below is some test code for parsing the large San Francisco City Lots JSON dataset using ijson v3.0rc1 coroutines. I’m also using a generator function to lazily read the JSON file line by line. As an alternative, the example in the ijson documentation shows reading from a file object in chunks. The parser_coroutine function gets the output generated by the low-level parser and prints it:

All python code is Python 3.8+.

import ijson

@ijson.coroutine
def parser_coroutine():
    """Receive (prefix, event, value) tuples via send() and print each one."""
    while True:
        # Suspend until the next parser event is sent in, then unpack it.
        parsed = yield
        prefix, event, value = parsed
        print(f"prefix: {prefix}, event: {event}, value: {value}")

def read_in_line(file_object):
    """Lazily yield lines from ``file_object``, one ``readline()`` call at a time.

    The generator finishes as soon as ``readline()`` returns an empty
    (falsy) value.
    """
    while (current := file_object.readline()):
        yield current

# Chain ijson's low-level parser in front of our printing coroutine.
parse_coro_impl = ijson.parse_coro(parser_coroutine())
# Each line is re-encoded with str.encode() (UTF-8) before being sent to the
# parser, so decode the file as UTF-8 explicitly rather than relying on the
# platform's default encoding.
with open('citylots.json', 'r', encoding='utf-8') as file_reader:
    for line in read_in_line(file_reader):
        print("line:", line)
        parse_coro_impl.send(line.encode())
    # Tell the parser that no more data is coming (the ijson docs call
    # close() after the last send); without it, truncated input would go
    # unreported.
    parse_coro_impl.close()

The dataset contains an array of JSON objects with the city lots property information. The low-level ijson parser iterates over the JSON elements and breaks them down into three-element tuples that describe where the element fits in the JSON structure, its type, and its value. Here is the beginning of the dataset file and the first JSON object in the features array:

$ head -n 5 citylots.json 
{
"type": "FeatureCollection",
"features": [
{ "type": "Feature", "properties": { "MAPBLKLOT": "0001001", "BLKLOT": "0001001", "BLOCK_NUM": "0001", "LOT_NUM": "001", "FROM_ST": "0", "TO_ST": "0", "STREET": "UNKNOWN", "ST_TYPE": null, "ODD_EVEN": "E" }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -122.422003528252475, 37.808480096967251, 0.0 ], [ -122.422076013325281, 37.808835019815085, 0.0 ], [ -122.421102174348633, 37.808803534992904, 0.0 ], [ -122.421062569067274, 37.808601056818148, 0.0 ], [ -122.422003528252475, 37.808480096967251, 0.0 ] ] ] } }
,
line: {

prefix: , event: start_map, value: None
line: "type": "FeatureCollection",

prefix: , event: map_key, value: type
prefix: type, event: string, value: FeatureCollection
line: "features": [

prefix: , event: map_key, value: features
prefix: features, event: start_array, value: None
line: { "type": "Feature", "properties": { "MAPBLKLOT": "0001001", "BLKLOT": "0001001", "BLOCK_NUM": "0001", "LOT_NUM": "001", "FROM_ST": "0", "TO_ST": "0", "STREET": "UNKNOWN", "ST_TYPE": null, "ODD_EVEN": "E" }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -122.422003528252475, 37.808480096967251, 0.0 ], [ -122.422076013325281, 37.808835019815085, 0.0 ], [ -122.421102174348633, 37.808803534992904, 0.0 ], [ -122.421062569067274, 37.808601056818148, 0.0 ], [ -122.422003528252475, 37.808480096967251, 0.0 ] ] ] } }

prefix: features.item, event: start_map, value: None
prefix: features.item, event: map_key, value: type
prefix: features.item.type, event: string, value: Feature
prefix: features.item, event: map_key, value: properties
prefix: features.item.properties, event: start_map, value: None
prefix: features.item.properties, event: map_key, value: MAPBLKLOT
prefix: features.item.properties.MAPBLKLOT, event: string, value: 0001001
prefix: features.item.properties, event: map_key, value: BLKLOT
prefix: features.item.properties.BLKLOT, event: string, value: 0001001
prefix: features.item.properties, event: map_key, value: BLOCK_NUM
prefix: features.item.properties.BLOCK_NUM, event: string, value: 0001
prefix: features.item.properties, event: map_key, value: LOT_NUM
prefix: features.item.properties.LOT_NUM, event: string, value: 001
prefix: features.item.properties, event: map_key, value: FROM_ST
prefix: features.item.properties.FROM_ST, event: string, value: 0
prefix: features.item.properties, event: map_key, value: TO_ST
prefix: features.item.properties.TO_ST, event: string, value: 0
prefix: features.item.properties, event: map_key, value: STREET
prefix: features.item.properties.STREET, event: string, value: UNKNOWN
prefix: features.item.properties, event: map_key, value: ST_TYPE
prefix: features.item.properties.ST_TYPE, event: null, value: None
prefix: features.item.properties, event: map_key, value: ODD_EVEN
prefix: features.item.properties.ODD_EVEN, event: string, value: E
prefix: features.item.properties, event: end_map, value: None
prefix: features.item, event: map_key, value: geometry
prefix: features.item.geometry, event: start_map, value: None
prefix: features.item.geometry, event: map_key, value: type
prefix: features.item.geometry.type, event: string, value: Polygon
prefix: features.item.geometry, event: map_key, value: coordinates
prefix: features.item.geometry.coordinates, event: start_array, value: None
prefix: features.item.geometry.coordinates.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.422003528252475
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.808480096967251
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.422076013325281
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.808835019815085
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.421102174348633
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.808803534992904
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.421062569067274
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.808601056818148
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.422003528252475
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.808480096967251
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item, event: end_array, value: None
prefix: features.item.geometry.coordinates, event: end_array, value: None
prefix: features.item.geometry, event: end_map, value: None
prefix: features.item, event: end_map, value: None
line: ,

This is the last JSON object in the features array and end of the file:

$ tail -n 5 citylots.json 
,
{ "type": "Feature", "properties": { "MAPBLKLOT": "VACSTWIL", "BLKLOT": "VACSTWIL", "BLOCK_NUM": "VACST", "LOT_NUM": "WIL", "FROM_ST": null, "TO_ST": null, "STREET": null, "ST_TYPE": null, "ODD_EVEN": null }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -122.424075168366173, 37.782909438426415, 0.0 ], [ -122.424094360186615, 37.783004345097652, 0.0 ], [ -122.423873989053888, 37.783032415674377, 0.0 ], [ -122.423624577385425, 37.783064185117638, 0.0 ], [ -122.422685009512293, 37.783183859540742, 0.0 ], [ -122.42266581942296, 37.783088951742364, 0.0 ], [ -122.422930605732873, 37.783055226671081, 0.0 ], [ -122.423605385014795, 37.782969278389494, 0.0 ], [ -122.423714717181866, 37.782955351504917, 0.0 ], [ -122.423841131273619, 37.782939249620277, 0.0 ], [ -122.424075168366173, 37.782909438426415, 0.0 ] ] ] } }

]
}
line: ,

line: { "type": "Feature", "properties": { "MAPBLKLOT": "VACSTWIL", "BLKLOT": "VACSTWIL", "BLOCK_NUM": "VACST", "LOT_NUM": "WIL", "FROM_ST": null, "TO_ST": null, "STREE
T": null, "ST_TYPE": null, "ODD_EVEN": null }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -122.424075168366173, 37.782909438426415, 0.0 ], [ -122.42409436018
6615, 37.783004345097652, 0.0 ], [ -122.423873989053888, 37.783032415674377, 0.0 ], [ -122.423624577385425, 37.783064185117638, 0.0 ], [ -122.422685009512293, 37.783183
859540742, 0.0 ], [ -122.42266581942296, 37.783088951742364, 0.0 ], [ -122.422930605732873, 37.783055226671081, 0.0 ], [ -122.423605385014795, 37.782969278389494, 0.0 ]
, [ -122.423714717181866, 37.782955351504917, 0.0 ], [ -122.423841131273619, 37.782939249620277, 0.0 ], [ -122.424075168366173, 37.782909438426415, 0.0 ] ] ] } }

prefix: features.item, event: start_map, value: None
prefix: features.item, event: map_key, value: type
prefix: features.item.type, event: string, value: Feature
prefix: features.item, event: map_key, value: properties
prefix: features.item.properties, event: start_map, value: None
prefix: features.item.properties, event: map_key, value: MAPBLKLOT
prefix: features.item.properties.MAPBLKLOT, event: string, value: VACSTWIL
prefix: features.item.properties, event: map_key, value: BLKLOT
prefix: features.item.properties.BLKLOT, event: string, value: VACSTWIL
prefix: features.item.properties, event: map_key, value: BLOCK_NUM
prefix: features.item.properties.BLOCK_NUM, event: string, value: VACST
prefix: features.item.properties, event: map_key, value: LOT_NUM
prefix: features.item.properties.LOT_NUM, event: string, value: WIL
prefix: features.item.properties, event: map_key, value: FROM_ST
prefix: features.item.properties.FROM_ST, event: null, value: None
prefix: features.item.properties, event: map_key, value: TO_ST
prefix: features.item.properties.TO_ST, event: null, value: None
prefix: features.item.properties, event: map_key, value: STREET
prefix: features.item.properties.STREET, event: null, value: None
prefix: features.item.properties, event: map_key, value: ST_TYPE
prefix: features.item.properties.ST_TYPE, event: null, value: None
prefix: features.item.properties, event: map_key, value: ODD_EVEN
prefix: features.item.properties.ODD_EVEN, event: null, value: None
prefix: features.item.properties, event: end_map, value: None
prefix: features.item, event: map_key, value: geometry
prefix: features.item.geometry, event: start_map, value: None
prefix: features.item.geometry, event: map_key, value: type
prefix: features.item.geometry.type, event: string, value: Polygon
prefix: features.item.geometry, event: map_key, value: coordinates
prefix: features.item.geometry.coordinates, event: start_array, value: None
prefix: features.item.geometry.coordinates.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.424075168366173
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.782909438426415
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.424094360186615
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.783004345097652
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.423873989053888
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.783032415674377
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.423624577385425
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.783064185117638
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.422685009512293
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.783183859540742
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.42266581942296
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.783088951742364
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.422930605732873
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.783055226671081
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.423605385014795
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.782969278389494
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.423714717181866
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.782955351504917
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.423841131273619
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.782939249620277
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item.item, event: start_array, value: None
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: -122.424075168366173
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 37.782909438426415
prefix: features.item.geometry.coordinates.item.item.item, event: number, value: 0.0
prefix: features.item.geometry.coordinates.item.item, event: end_array, value: None
prefix: features.item.geometry.coordinates.item, event: end_array, value: None
prefix: features.item.geometry.coordinates, event: end_array, value: None
prefix: features.item.geometry, event: end_map, value: None
prefix: features.item, event: end_map, value: None
line: 

line: ]

prefix: features, event: end_array, value: None
line: }

prefix: , event: end_map, value: None

Here is a similar example with the higher-level interface, where we can just parse the properties objects as Python dicts:

import ijson

@ijson.coroutine
def items_coroutine():
    """Receive objects via send() and print the keys of each one."""
    while True:
        # Suspend until the next object is sent in.
        received = yield
        print(received.keys())

def read_in_line(file_object):
    """Lazily yield lines from ``file_object``, one ``readline()`` call at a time.

    The generator finishes as soon as ``readline()`` returns an empty
    (falsy) value.
    """
    while (current := file_object.readline()):
        yield current

# Build objects found under 'features.item.properties' and pass each one to
# our printing coroutine.
items_coro_impl = ijson.items_coro(items_coroutine(), 'features.item.properties')
# Each line is re-encoded with str.encode() (UTF-8) before being sent to the
# parser, so decode the file as UTF-8 explicitly rather than relying on the
# platform's default encoding.
with open('citylots.json', 'r', encoding='utf-8') as file_reader:
    for line in read_in_line(file_reader):
        print("line:", line)
        items_coro_impl.send(line.encode())
    # Tell the parser that no more data is coming (the ijson docs call
    # close() after the last send); without it, truncated input would go
    # unreported.
    items_coro_impl.close()
$ head -n 10 citylots.json 
{
"type": "FeatureCollection",
"features": [
{ "type": "Feature", "properties": { "MAPBLKLOT": "0001001", "BLKLOT": "0001001", "BLOCK_NUM": "0001", "LOT_NUM": "001", "FROM_ST": "0", "TO_ST": "0", "STREET": "UNKNOW
N", "ST_TYPE": null, "ODD_EVEN": "E" }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -122.422003528252475, 37.808480096967251, 0.0 ], [ -122.422076013325281, 3
7.808835019815085, 0.0 ], [ -122.421102174348633, 37.808803534992904, 0.0 ], [ -122.421062569067274, 37.808601056818148, 0.0 ], [ -122.422003528252475, 37.8084800969672
51, 0.0 ] ] ] } }
,
{ "type": "Feature", "properties": { "MAPBLKLOT": "0002001", "BLKLOT": "0002001", "BLOCK_NUM": "0002", "LOT_NUM": "001", "FROM_ST": "0", "TO_ST": "0", "STREET": "UNKNOW
N", "ST_TYPE": null, "ODD_EVEN": "E" }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -122.42082593937107, 37.808631474146033, 0.0 ], [ -122.420858049679694, 37
.808795641369592, 0.0 ], [ -122.419811958704301, 37.808761809714007, 0.0 ], [ -122.42082593937107, 37.808631474146033, 0.0 ] ] ] } }
,
line: {

line: "type": "FeatureCollection",

line: "features": [

line: { "type": "Feature", "properties": { "MAPBLKLOT": "0001001", "BLKLOT": "0001001", "BLOCK_NUM": "0001", "LOT_NUM": "001", "FROM_ST": "0", "TO_ST": "0", "STREET": "
UNKNOWN", "ST_TYPE": null, "ODD_EVEN": "E" }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -122.422003528252475, 37.808480096967251, 0.0 ], [ -122.422076013325
281, 37.808835019815085, 0.0 ], [ -122.421102174348633, 37.808803534992904, 0.0 ], [ -122.421062569067274, 37.808601056818148, 0.0 ], [ -122.422003528252475, 37.8084800
96967251, 0.0 ] ] ] } }

dict_keys(['MAPBLKLOT', 'BLKLOT', 'BLOCK_NUM', 'LOT_NUM', 'FROM_ST', 'TO_ST', 'STREET', 'ST_TYPE', 'ODD_EVEN'])
line: ,

line: { "type": "Feature", "properties": { "MAPBLKLOT": "0002001", "BLKLOT": "0002001", "BLOCK_NUM": "0002", "LOT_NUM": "001", "FROM_ST": "0", "TO_ST": "0", "STREET": "
UNKNOWN", "ST_TYPE": null, "ODD_EVEN": "E" }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -122.42082593937107, 37.808631474146033, 0.0 ], [ -122.4208580496796
94, 37.808795641369592, 0.0 ], [ -122.419811958704301, 37.808761809714007, 0.0 ], [ -122.42082593937107, 37.808631474146033, 0.0 ] ] ] } }

dict_keys(['MAPBLKLOT', 'BLKLOT', 'BLOCK_NUM', 'LOT_NUM', 'FROM_ST', 'TO_ST', 'STREET', 'ST_TYPE', 'ODD_EVEN'])
line: ,

No comments yet

Leave a Reply: