Fixed issue with missing commits for unmapped users

* Introduced syntetic primary key for user instead of launchpad id
* Removed start_date from release definition
* Normalize release data in storage
* Launchpad query to prod instead of stage

Change-Id: I647b197857ddf92bbd3b8c50ada6f02eaeaba5fb
This commit is contained in:
Ilya Shakhat 2013-07-30 19:40:34 +04:00
parent 5c554a35f9
commit 6c33901c05
11 changed files with 195 additions and 134 deletions

View File

@ -29,7 +29,7 @@ class CachedMemoryStorage(MemoryStorage):
self.records = {}
self.record_types_index = {}
self.module_index = {}
self.launchpad_id_index = {}
self.user_id_index = {}
self.company_index = {}
self.release_index = {}
@ -37,7 +37,7 @@ class CachedMemoryStorage(MemoryStorage):
'record_type': self.record_types_index,
'company_name': self.company_index,
'module': self.module_index,
'launchpad_id': self.launchpad_id_index,
'user_id': self.user_id_index,
'release': self.release_index,
}
@ -84,9 +84,9 @@ class CachedMemoryStorage(MemoryStorage):
map(self.get_original_company_name, companies),
self.company_index)
def get_record_ids_by_launchpad_ids(self, launchpad_ids):
def get_record_ids_by_user_ids(self, launchpad_ids):
return self._get_record_ids_from_index(launchpad_ids,
self.launchpad_id_index)
self.user_id_index)
def get_record_ids_by_releases(self, releases):
return self._get_record_ids_from_index(releases, self.release_index)
@ -119,8 +119,8 @@ class CachedMemoryStorage(MemoryStorage):
def get_modules(self):
return self.module_index.keys()
def get_launchpad_ids(self):
return self.launchpad_id_index.keys()
def get_user_ids(self):
return self.user_id_index.keys()
def get_memory_storage(memory_storage_type, records):

View File

@ -20,7 +20,9 @@
{% if user.companies %}
<div>Company: {{ user.companies[-1].company_name|link('/companies/' + user.companies[-1].company_name)|safe }}</div>
{% endif %}
<div>Launchpad: <a href="https://launchpad.net/~{{ launchpad_id }}">{{ launchpad_id }}</a></div>
{% if user.launchpad_id %}
<div>Launchpad: <a href="https://launchpad.net/~{{ user.launchpad_id }}">{{ user.launchpad_id }}</a></div>
{% endif %}
</div>
<h3>Commits history</h3>

View File

@ -31,7 +31,7 @@ from stackalytics.openstack.common import log as logging
from stackalytics.processor import config
from stackalytics.processor import persistent_storage
from stackalytics.processor import runtime_storage
from stackalytics.processor import user_utils
from stackalytics.processor import utils
# Constants and Parameters ---------
@ -86,9 +86,15 @@ def get_vault():
memory_storage.MEMORY_STORAGE_CACHED,
vault['runtime_storage'].get_update(os.getpid()))
releases = vault['persistent_storage'].get_releases()
releases = list(vault['persistent_storage'].get_releases())
vault['start_date'] = releases[0]['end_date']
vault['end_date'] = releases[-1]['end_date']
start_date = releases[0]['end_date']
for r in releases[1:]:
r['start_date'] = start_date
start_date = r['end_date']
vault['releases'] = dict((r['release_name'].lower(), r)
for r in releases)
for r in releases[1:])
modules = vault['persistent_storage'].get_repos()
vault['modules'] = dict((r['module'].lower(),
r['project_type'].lower()) for r in modules)
@ -224,11 +230,11 @@ def record_filter(ignore=None, use_default=True):
record_ids &= (
memory_storage.get_record_ids_by_modules(modules))
if 'launchpad_id' not in ignore:
param = get_parameter(kwargs, 'launchpad_id', 'launchpad_ids')
if 'user_id' not in ignore:
param = get_parameter(kwargs, 'user_id', 'user_ids')
if param:
record_ids &= (
memory_storage.get_record_ids_by_launchpad_ids(param))
memory_storage.get_record_ids_by_user_ids(param))
if 'company' not in ignore:
param = get_parameter(kwargs, 'company', 'companies')
@ -439,16 +445,15 @@ def module_details(module, records):
return details
@app.route('/engineers/<launchpad_id>')
@app.route('/engineers/<user_id>')
@exception_handler()
@templated()
@record_filter(ignore='metric')
def engineer_details(launchpad_id, records):
def engineer_details(user_id, records):
persistent_storage = get_vault()['persistent_storage']
user = list(persistent_storage.get_users(launchpad_id=launchpad_id))[0]
user = list(persistent_storage.get_users(user_id=user_id))[0]
details = contribution_details(records)
details['launchpad_id'] = launchpad_id
details['user'] = user
return details
@ -498,8 +503,8 @@ def get_modules(records, metric_filter):
@aggregate_filter()
def get_engineers(records, metric_filter):
response = _get_aggregated_stats(records, metric_filter,
get_memory_storage().get_launchpad_ids(),
'launchpad_id', 'author')
get_memory_storage().get_user_ids(),
'user_id', 'author')
return json.dumps(response)
@ -512,15 +517,20 @@ def timeline(records, **kwargs):
releases = get_vault()['releases']
if not release_names:
flask.abort(404)
if not (set(release_names) & set(releases.keys())):
flask.abort(404)
release = releases[release_names[0]]
start_date = release_start_date = user_utils.timestamp_to_week(
user_utils.date_to_timestamp(release['start_date']))
end_date = release_end_date = user_utils.timestamp_to_week(
user_utils.date_to_timestamp(release['end_date']))
now = user_utils.timestamp_to_week(int(time.time()))
if 'all' in release_names:
start_date = release_start_date = utils.timestamp_to_week(
get_vault()['start_date'])
end_date = release_end_date = utils.timestamp_to_week(
get_vault()['end_date'])
else:
release = releases[release_names[0]]
start_date = release_start_date = utils.timestamp_to_week(
release['start_date'])
end_date = release_end_date = utils.timestamp_to_week(
release['end_date'])
now = utils.timestamp_to_week(int(time.time()))
# expand start-end to year if needed
if release_end_date - release_start_date < 52:
@ -552,7 +562,7 @@ def timeline(records, **kwargs):
array_commits_hl = []
for week in weeks:
week_str = user_utils.week_to_date(week)
week_str = utils.week_to_date(week)
array_loc.append([week_str, week_stat_loc[week]])
array_commits.append([week_str, week_stat_commits[week]])
array_commits_hl.append([week_str, week_stat_commits_hl[week]])

View File

@ -14214,7 +14214,6 @@
"releases": [
{
"release_name": "Havana",
"tag_from": "5a1376ca",
"tag_to": "HEAD"
}
]
@ -14378,25 +14377,29 @@
],
"releases": [
{
"release_name": "prehistory",
"end_date": "2011-Apr-21"
},
{
"release_name": "Diablo",
"end_date": "2011-Sep-08"
},
{
"release_name": "Essex",
"start_date": "2011-Oct-01",
"end_date": "2012-Apr-01"
"end_date": "2012-Apr-05"
},
{
"release_name": "Folsom",
"start_date": "2012-Apr-01",
"end_date": "2012-Oct-01"
"end_date": "2012-Oct-04"
},
{
"release_name": "Grizzly",
"start_date": "2012-Oct-01",
"end_date": "2013-Apr-01"
"end_date": "2013-Apr-04"
},
{
"release_name": "Havana",
"start_date": "2013-Apr-01",
"end_date": "now"
"end_date": "2013-Oct-17"
}
]

View File

@ -45,7 +45,6 @@
"releases": [
{
"release_name": "Havana",
"tag_from": "5a1376ca",
"tag_to": "HEAD"
}
]
@ -53,25 +52,29 @@
],
"releases": [
{
"release_name": "prehistory",
"end_date": "2011-Apr-21"
},
{
"release_name": "Diablo",
"end_date": "2011-Sep-08"
},
{
"release_name": "Essex",
"start_date": "2011-Oct-01",
"end_date": "2012-Apr-01"
"end_date": "2012-Apr-05"
},
{
"release_name": "Folsom",
"start_date": "2012-Apr-01",
"end_date": "2012-Oct-01"
"end_date": "2012-Oct-04"
},
{
"release_name": "Grizzly",
"start_date": "2012-Oct-01",
"end_date": "2013-Apr-01"
"end_date": "2013-Apr-04"
},
{
"release_name": "Havana",
"start_date": "2013-Apr-01",
"end_date": "now"
"end_date": "2013-Oct-17"
}
]

View File

@ -27,6 +27,7 @@ from stackalytics.processor import persistent_storage
from stackalytics.processor import rcs
from stackalytics.processor import record_processor
from stackalytics.processor import runtime_storage
from stackalytics.processor import utils
from stackalytics.processor import vcs
@ -130,6 +131,46 @@ def apply_corrections(uri, runtime_storage_inst):
runtime_storage_inst.apply_corrections(corrections)
def _read_default_persistent_storage(file_name):
try:
with open(file_name, 'r') as content_file:
content = content_file.read()
return json.loads(content)
except Exception as e:
LOG.error('Error while reading config: %s' % e)
def process_users(users):
res = []
for user in users:
if ('launchpad_id' not in user) or ('emails' not in user):
LOG.warn('Skipping invalid user: %s', user)
continue
u = utils.normalize_user(user.copy())
u['user_id'] = user['launchpad_id'] or user['emails'][0]
res.append(u)
return res
def process_releases(releases):
res = []
for release in releases:
r = utils.normalize_release(release)
res.append(r)
res.sort(key=lambda x: x['end_date'])
return res
def load_default_data(persistent_storage_inst, file_name, force):
default_data = _read_default_persistent_storage(file_name)
default_data['users'] = process_users(default_data['users'])
default_data['releases'] = process_releases(default_data['releases'])
persistent_storage_inst.sync(default_data, force=force)
def main():
# init conf and logging
conf = cfg.CONF
@ -145,9 +186,9 @@ def main():
if conf.sync_default_data or conf.force_sync_default_data:
LOG.info('Going to synchronize persistent storage with default data '
'from file %s' % cfg.CONF.default_data)
persistent_storage_inst.sync(cfg.CONF.default_data,
force=conf.force_sync_default_data)
'from file %s', cfg.CONF.default_data)
load_default_data(persistent_storage_inst, cfg.CONF.default_data,
cfg.CONF.force_sync_default_data)
return 0
runtime_storage_inst = runtime_storage.get_runtime_storage(

View File

@ -13,14 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging
import re
import pymongo
from stackalytics.processor import user_utils
LOG = logging.getLogger(__name__)
@ -28,22 +25,21 @@ class PersistentStorage(object):
def __init__(self, uri):
pass
def sync(self, default_data_file_name, force=False):
def sync(self, default_data, force=False):
if force:
self.clean_all()
default_data = self._read_default_persistent_storage(
default_data_file_name)
self._build_index(default_data['repos'], 'uri',
self.get_repos, self.insert_repo)
self._build_index(default_data['companies'], 'company_name',
self.get_companies, self.insert_company)
self._build_index(default_data['users'], 'launchpad_id',
self._build_index(default_data['users'], 'user_id',
self.get_users, self.insert_user)
self._build_index(default_data['releases'], 'release_name',
self.get_releases, self.insert_release)
LOG.debug('Sync completed')
def _build_index(self, default_data, primary_key, getter, inserter):
# loads all items from persistent storage
existing_items = set([item[primary_key] for item in getter()])
@ -81,14 +77,6 @@ class PersistentStorage(object):
def clean_all(self):
pass
def _read_default_persistent_storage(self, file_name):
try:
with open(file_name, 'r') as content_file:
content = content_file.read()
return json.loads(content)
except Exception as e:
LOG.error('Error while reading config: %s' % e)
class MongodbStorage(PersistentStorage):
def __init__(self, uri):
@ -127,10 +115,11 @@ class MongodbStorage(PersistentStorage):
return self.mongo.users.find(criteria)
def insert_user(self, user):
self.mongo.users.insert(user_utils.normalize_user(user))
LOG.debug('Insert new user: %s', user)
self.mongo.users.insert(user)
def update_user(self, user):
user_utils.normalize_user(user)
LOG.debug('Update user: %s', user)
launchpad_id = user['launchpad_id']
self.mongo.users.update({'launchpad_id': launchpad_id}, user)

View File

@ -19,7 +19,7 @@ import re
from launchpadlib import launchpad
from oslo.config import cfg
from stackalytics.processor import user_utils
from stackalytics.processor import utils
LOG = logging.getLogger(__name__)
@ -75,13 +75,14 @@ class CachedProcessor(RecordProcessor):
break
else:
persistent_user = None
if persistent_user:
# user already exist, merge
LOG.debug('User exists in persistent storage, add new email %s',
email)
persistent_user_email = persistent_user['emails'][0]
if persistent_user_email not in self.users_index:
raise Exception('User index is not valid')
return persistent_user
user = self.users_index[persistent_user_email]
user['emails'].append(email)
self.persistent_storage.update_user(user)
@ -89,8 +90,9 @@ class CachedProcessor(RecordProcessor):
# add new user
LOG.debug('Add new user into persistent storage')
company = (self._get_company_by_email(email) or
self.domains_index[''])
self._get_independent())
user = {
'user_id': launchpad_id,
'launchpad_id': launchpad_id,
'user_name': user_name,
'emails': [email],
@ -103,35 +105,41 @@ class CachedProcessor(RecordProcessor):
return user
def _unknown_user_email(self, email):
def _unknown_user_email(self, email, user_name):
lp_profile = None
if not re.match(r'[\w\d_\.-]+@([\w\d_\.-]+\.)+[\w]+', email):
LOG.debug('User email is not valid %s' % email)
else:
LOG.debug('Lookup user email %s at Launchpad' % email)
lp = launchpad.Launchpad.login_anonymously('stackalytics')
lp = launchpad.Launchpad.login_anonymously('stackalytics',
'production')
try:
lp_profile = lp.people.getByEmail(email=email)
except Exception as error:
LOG.warn('Lookup of email %s failed %s' %
(email, error.message))
LOG.warn('Lookup of email %s failed %s', email, error.message)
if not lp_profile:
# user is not found in Launchpad, create dummy record for commit
# update
LOG.debug('Email is not found at Launchpad, mapping to nobody')
user = {
'launchpad_id': None,
'user_id': email,
'user_name': user_name,
'emails': [email],
'companies': [{
'company_name': self.domains_index[''],
'company_name': self._get_independent(),
'end_date': 0
}]
}
# add new user
self.persistent_storage.insert_user(user)
else:
# get user's launchpad id from his profile
launchpad_id = lp_profile.name
user_name = lp_profile.display_name
LOG.debug('Found user %s' % launchpad_id)
LOG.debug('Found user %s', launchpad_id)
user = self._persist_user(launchpad_id, email, user_name)
@ -139,6 +147,9 @@ class CachedProcessor(RecordProcessor):
self.users_index[email] = user
return user
def _get_independent(self):
return self.domains_index['']
class CommitProcessor(CachedProcessor):
def __init__(self, persistent_storage):
@ -150,12 +161,16 @@ class CommitProcessor(CachedProcessor):
if email in self.users_index:
user = self.users_index[email]
else:
user = self._unknown_user_email(email)
user = self._unknown_user_email(email, commit['author'])
commit['launchpad_id'] = user['launchpad_id']
commit['user_id'] = user['user_id']
company = self._get_company_by_email(email)
if not company:
company = self._find_company(user['companies'], commit['date'])
commit['company_name'] = company
if 'user_name' in user:
commit['author_name'] = user['user_name']
@ -168,7 +183,7 @@ class CommitProcessor(CachedProcessor):
record['record_type'] = 'commit'
record['primary_key'] = record['commit_id']
record['week'] = user_utils.timestamp_to_week(record['date'])
record['week'] = utils.timestamp_to_week(record['date'])
record['loc'] = record['lines_added'] + record['lines_deleted']
yield record
@ -183,14 +198,8 @@ class ReviewProcessor(CachedProcessor):
for user in users:
self.launchpad_to_company_index[user['launchpad_id']] = user
self.releases = []
for release in persistent_storage.get_releases():
r = release.copy()
r['end_date_ts'] = user_utils.date_to_timestamp(r['end_date'])
r['release_name'] = r['release_name'].lower()
self.releases.append(r)
self.releases.sort(key=lambda x: x['end_date_ts'])
self.releases_dates = [r['end_date_ts'] for r in self.releases]
self.releases = list(persistent_storage.get_releases())
self.releases_dates = [r['end_date'] for r in self.releases]
LOG.debug('Review processor is instantiated')
@ -208,22 +217,28 @@ class ReviewProcessor(CachedProcessor):
company = self._get_company_by_email(email)
if not company:
company = self._find_company(user['companies'], date)
return company
return company, user['user_id']
def _spawn_review(self, record):
# copy everything except pathsets and flatten user data
review = dict([(k, v) for k, v in record.iteritems()
if k not in ['patchSets', 'owner']])
owner = record['owner']
company = self._process_user(owner['email'].lower(),
owner['username'],
owner['name'],
record['createdOn'])
if 'email' not in owner or 'username' not in owner:
return # ignore
review['record_type'] = 'review'
review['primary_key'] = record['id']
review['company_name'] = company
review['launchpad_id'] = owner['username']
review['author_email'] = owner['email'].lower()
review['release'] = self._get_release(review['createdOn'])
company, user_id = self._process_user(review['author_email'],
review['launchpad_id'],
owner['name'],
record['createdOn'])
review['company_name'] = company
review['user_id'] = user_id
yield review
def _spawn_marks(self, record):
@ -236,21 +251,24 @@ class ReviewProcessor(CachedProcessor):
mark = dict([(k, v) for k, v in approval.iteritems()
if k != 'by'])
reviewer = approval['by']
if 'email' not in reviewer or 'username' not in reviewer:
continue # ignore
mark['record_type'] = 'mark'
mark['primary_key'] = (record['id'] +
str(mark['grantedOn']) +
mark['type'])
mark['launchpad_id'] = reviewer['username']
mark['author_email'] = reviewer['email'].lower()
mark['module'] = record['module']
if 'email' not in reviewer:
continue
company = self._process_user(reviewer['email'],
reviewer['username'],
reviewer['name'],
mark['grantedOn'])
company, user_id = self._process_user(mark['author_email'],
mark['launchpad_id'],
reviewer['name'],
mark['grantedOn'])
mark['company_name'] = company
mark['user_id'] = user_id
mark['review_id'] = review_id
mark['release'] = self._get_release(mark['grantedOn'])

View File

@ -19,7 +19,8 @@ import time
def normalize_user(user):
user['emails'] = [email.lower() for email in user['emails']]
user['launchpad_id'] = user['launchpad_id'].lower()
if user['launchpad_id']:
user['launchpad_id'] = user['launchpad_id'].lower()
for c in user['companies']:
end_date_numeric = 0
@ -40,6 +41,12 @@ def normalize_user(user):
return user
def normalize_release(release):
release['release_name'] = release['release_name'].lower()
release['end_date'] = date_to_timestamp(release['end_date'])
return release
def date_to_timestamp(d):
if d == 'now':
return int(time.time())

View File

@ -102,7 +102,10 @@ class Git(Vcs):
for release in self.repo['releases']:
release_name = release['release_name'].lower()
tag_range = release['tag_from'] + '..' + release['tag_to']
if 'tag_from' in release:
tag_range = release['tag_from'] + '..' + release['tag_to']
else:
tag_range = release['tag_to']
git_log_iterator = sh.git('log', '--pretty=%H', tag_range,
_tty_out=False)
for commit_id in git_log_iterator:

View File

@ -42,7 +42,9 @@ class TestCommitProcessor(testtools.TestCase):
},
])
self.user = {
'launchpad_id': 'john_doe', 'user_name': 'John Doe',
'user_id': 'john_doe',
'launchpad_id': 'john_doe',
'user_name': 'John Doe',
'emails': ['johndoe@gmail.com', 'jdoe@super.no'],
'companies': [
{'company_name': '*independent',
@ -65,6 +67,13 @@ class TestCommitProcessor(testtools.TestCase):
super(TestCommitProcessor, self).tearDown()
self.launchpad_patch.stop()
def _make_commit(self, email='johndoe@gmail.com', date=1999999999):
return {
'author': 'John Doe',
'author_email': email,
'date': date,
}
def test_get_company_by_email_mapped(self):
email = 'jdoe@super.no'
res = self.commit_processor._get_company_by_email(email)
@ -86,20 +95,14 @@ class TestCommitProcessor(testtools.TestCase):
self.assertEquals(None, res)
def test_update_commit_existing_user(self):
commit = {
'author_email': 'johndoe@gmail.com',
'date': 1999999999,
}
commit = self._make_commit()
self.commit_processor._update_commit_with_user_data(commit)
self.assertEquals('SuperCompany', commit['company_name'])
self.assertEquals('john_doe', commit['launchpad_id'])
def test_update_commit_existing_user_old_job(self):
commit = {
'author_email': 'johndoe@gmail.com',
'date': 1000000000,
}
commit = self._make_commit(date=1000000000)
self.commit_processor._update_commit_with_user_data(commit)
self.assertEquals('*independent', commit['company_name'])
@ -111,10 +114,7 @@ class TestCommitProcessor(testtools.TestCase):
Should return other company instead of those mentioned in user db
"""
email = 'johndoe@nec.co.jp'
commit = {
'author_email': email,
'date': 1999999999,
}
commit = self._make_commit(email=email)
lp_mock = mock.MagicMock()
launchpad.Launchpad.login_anonymously = mock.Mock(return_value=lp_mock)
lp_profile = mock.Mock()
@ -138,10 +138,7 @@ class TestCommitProcessor(testtools.TestCase):
the user and return current company
"""
email = 'johndoe@yahoo.com'
commit = {
'author_email': email,
'date': 1999999999,
}
commit = self._make_commit(email=email)
lp_mock = mock.MagicMock()
launchpad.Launchpad.login_anonymously = mock.Mock(return_value=lp_mock)
lp_profile = mock.Mock()
@ -165,10 +162,7 @@ class TestCommitProcessor(testtools.TestCase):
Should add new user and set company depending on email
"""
email = 'smith@nec.com'
commit = {
'author_email': email,
'date': 1999999999,
}
commit = self._make_commit(email=email)
lp_mock = mock.MagicMock()
launchpad.Launchpad.login_anonymously = mock.Mock(return_value=lp_mock)
lp_profile = mock.Mock()
@ -189,10 +183,7 @@ class TestCommitProcessor(testtools.TestCase):
Should set user name and empty LPid
"""
email = 'inkognito@avs.com'
commit = {
'author_email': email,
'date': 1999999999,
}
commit = self._make_commit(email=email)
lp_mock = mock.MagicMock()
launchpad.Launchpad.login_anonymously = mock.Mock(return_value=lp_mock)
lp_mock.people.getByEmail = mock.Mock(return_value=None)
@ -209,10 +200,7 @@ class TestCommitProcessor(testtools.TestCase):
LP raises error during getting user info
"""
email = 'smith@avs.com'
commit = {
'author_email': email,
'date': 1999999999,
}
commit = self._make_commit(email=email)
lp_mock = mock.MagicMock()
launchpad.Launchpad.login_anonymously = mock.Mock(return_value=lp_mock)
lp_mock.people.getByEmail = mock.Mock(return_value=None,
@ -230,10 +218,7 @@ class TestCommitProcessor(testtools.TestCase):
User's email is malformed
"""
email = 'error.root'
commit = {
'author_email': email,
'date': 1999999999,
}
commit = self._make_commit(email=email)
lp_mock = mock.MagicMock()
launchpad.Launchpad.login_anonymously = mock.Mock(return_value=lp_mock)
lp_mock.people.getByEmail = mock.Mock(return_value=None)