Verified commit 057b44c4, authored by Eliot Berriot

Merge branch 'release/0.1'

parents 0b785e62 26b54f0e
Pipeline #174 passed with stages
in 3 minutes and 5 seconds
# Pipeline stages, in execution order: every `download` job runs before any
# `convert` job.
stages:
- download
- convert
# Download job for the `master` ref: runs fetch_database.py without a --max
# limit and publishes the result as raw.json.
full-download:
  image: python:3.5
  stage: download
  artifacts:
    # raw.json is kept for 30 days so later-stage jobs can consume it.
    expire_in: 30 days
    paths:
      - raw.json
  before_script:
    - pip install -r requirements.txt
  script:
    - python fetch_database.py --out raw.json
  # Restrict this job to the master ref.
  only:
    - master
  # Must be picked up by a runner tagged `docker`.
  tags:
    - docker
# Download job for every ref except `master`: runs fetch_database.py with
# `--max 100` (presumably a cap on the number of entries fetched; confirm
# against fetch_database.py) and publishes the result as raw.json.
partial-download:
  image: python:3.5
  stage: download
  artifacts:
    # raw.json is kept for 30 days so later-stage jobs can consume it.
    expire_in: 30 days
    paths:
      - raw.json
  before_script:
    - pip install -r requirements.txt
  script:
    - python fetch_database.py --max 100 --out raw.json
  # Skip this job on the master ref.
  except:
    - master
  # Must be picked up by a runner tagged `docker`.
  tags:
    - docker
# Convert job for the `master` ref: turns the raw.json artifact produced by
# `full-download` into data.json.
full-convert:
  image: python:3.5
  stage: convert
  # Only fetch artifacts from the matching download job.
  dependencies:
    - full-download
  artifacts:
    expire_in: 30 days
    paths:
      - data.json
  # Restrict this job to the master ref.
  only:
    - master
  before_script:
    - pip install -r requirements.txt
  script:
    - python convert_data.py --in raw.json --out data.json
  # Must be picked up by a runner tagged `docker`.
  tags:
    - docker
# Convert job for every ref except `master`: turns the raw.json artifact
# produced by `partial-download` into data.json.
partial-convert:
  image: python:3.5
  stage: convert
  # Only fetch artifacts from the matching download job.
  dependencies:
    - partial-download
  artifacts:
    expire_in: 30 days
    paths:
      - data.json
  # Skip this job on the master ref.
  except:
    - master
  before_script:
    - pip install -r requirements.txt
  script:
    - python convert_data.py --in raw.json --out data.json
  # Must be picked up by a runner tagged `docker`.
  tags:
    - docker
import argparse
import json
import collections
import datetime
import lxml.html
import logging
......@@ -23,17 +24,29 @@ def clean_database(**kwargs):
scps = sorted(scps, key=lambda s: s['id'])
logger.info('Converting scps...')
final_scps = (convert_raw_scp(scp) for scp in scps)
final_scps = [convert_raw_scp(scp) for scp in scps]
data = {
'scps': final_scps,
'tags': compute_tags_data(final_scps),
}
logger.info('Exporting data to {}'.format(kwargs['out']))
with open(kwargs['out'], 'wb') as f:
j = json.dumps(
list(final_scps),
data,
sort_keys=True,
indent=4)
f.write(j.encode('utf-8'))
def compute_tags_data(scp_data):
    """Count how often each tag occurs across all SCP entries.

    Args:
        scp_data: Iterable of mappings, each exposing a ``'tags'`` key whose
            value is an iterable of hashable tags.

    Returns:
        A plain ``dict`` mapping each tag to its total number of occurrences.
        A tag listed several times within one entry is counted each time.

    Raises:
        KeyError: If an entry has no ``'tags'`` key.
    """
    # Flatten every entry's tags into a single stream and tally it in one pass.
    counter = collections.Counter(
        tag for scp in scp_data for tag in scp['tags']
    )
    # Hand back a plain dict rather than the Counter subclass.
    return dict(counter)
def convert_raw_scp(scp):
logger.info('Converting SCP-{}'.format(scp['id']))
root = lxml.html.fromstring(scp['content'])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment