From c601822563e4ce6edfa785fc946f9d19d50ca331 Mon Sep 17 00:00:00 2001 From: pkholkin Date: Tue, 13 May 2014 13:45:11 +0400 Subject: [PATCH] Fixed aliases implements bp member-directory Change-Id: I1a2ee49276316c05a9fd064bb1ffa39c2f2e9606 --- etc/default_data.json | 149 +++++++----------- .../processor/default_data_processor.py | 8 +- stackalytics/processor/record_processor.py | 3 +- stackalytics/processor/utils.py | 13 ++ tests/unit/test_utils.py | 13 ++ 5 files changed, 94 insertions(+), 92 deletions(-) diff --git a/etc/default_data.json b/etc/default_data.json index 1139ad000..4e58985ad 100644 --- a/etc/default_data.json +++ b/etc/default_data.json @@ -5693,12 +5693,11 @@ { "domains": [""], "company_name": "*independent", - "aliases": ["None", "Non", "l-", ".", "****", "1", "aaa", "-", "dsadsadsadsad", "I dont have one", "company", "n/a", "Self", "Student", "home", "Home Based", "Independent", "Independen", "Independant", "MyHome", "HomeOffice", "Self Employeed", "Self Employed", "myself", "Self-employeed", "individual", "Individual Contributor", "Unaffiliated", "没有"] + "aliases": ["None", "Non", "l-", ".", "****", "1", "aaa", "-", "dsadsadsadsad", "I dont have one", "company", "n/a", "Self", "Student", "home", "Home Based", "Independent", "Independen", "Independant", "MyHome", "HomeOffice", "Self Employeed", "Self Employed", "myself", "Self-employeed", "individual", "Individual Contributor", "Unaffiliated", "没有", "Null", "Univerisity", "fsfsf", "xxx"] }, { "domains": ["360.cn"], - "company_name": "Qihoo 360 Technology Co", - "aliases": ["Qihoo 360 Technology Co. Ltd."] + "company_name": "Qihoo 360 Technology Co" }, { "domains": ["3ds.com"], @@ -5706,8 +5705,7 @@ }, { "domains": ["4loops.com"], - "company_name": "Four Loops Solutions", - "aliases": ["Four Loops Solutions Pvt. Ltd.", "Four Loops Solutions Pvt Ltd", "Four Loops Solutions Pvt. Ltd"] + "company_name": "Four Loops Solutions" }, { "domains": ["99cloud.net"], @@ -5724,8 +5722,7 @@ }, { "domains": ["alyseo.com"], - "company_name": "Alyseo", - "aliases": ["ALYSEO"] + "company_name": "Alyseo" }, { "domains": ["anl.gov"], @@ -5745,18 +5742,15 @@ }, { "domains": ["aristanetworks.com"], - "company_name": "Arista Networks", - "aliases": ["Arista Networks Inc"] + "company_name": "Arista Networks" }, { "domains": ["arubanetworks.com"], - "company_name": "Aruba Networks", - "aliases": ["Aruba Networks, Inc."] + "company_name": "Aruba Networks" }, { "domains": ["askbot.com"], - "company_name": "Askbot", - "aliases": ["Askbot, S.p.A."] + "company_name": "Askbot" }, { "domains": ["atomia.com"], @@ -5768,13 +5762,11 @@ }, { "domains": ["awcloud.com"], - "company_name": "Awcloud", - "aliases": ["awcloud"] + "company_name": "Awcloud" }, { "domains": ["b1-systems.de"], - "company_name": "B1 Systems", - "aliases": ["B1 Systems GmbH"] + "company_name": "B1 Systems" }, { "domains": ["bacoosta.com"], @@ -5782,12 +5774,12 @@ }, { "domains": ["bestbuy.com"], - "company_name": "Best Buy", - "aliases": ["Best Buy Corp."] + "company_name": "Best Buy" }, { "domains": ["bigswitch.com"], - "company_name": "Big Switch Networks" + "company_name": "Big Switch Networks", + "aliases": ["Big Switch"] }, { "domains": ["bitergia.com"], @@ -5805,7 +5797,7 @@ { "domains": ["brightcomputing.com"], "company_name": "Bright Computing", - "aliases": ["Bright Computing, BV", "Bright Computing, Inc."] + "aliases": ["Bright Computing, BV"] }, { "domains": ["brinkster.com"], @@ -5833,7 +5825,7 @@ { "domains": ["canonical.com"], "company_name": "Canonical", - "aliases": ["Canonical Ltd"] + "aliases": ["Canoncail, Ltd."] }, { "domains": ["centraldesktop.com"], @@ -5850,7 +5842,7 @@ { "domains": ["cisco.com"], "company_name": "Cisco Systems", - "aliases": ["Cisco System", "Cisco Systems", "Cisco Systems Inc.", "Cisco Systems, Inc.", "Cisco Systems Inc., Intel, Microsoft, Dorkbotz", "Cisco System, Inc., Nebula, Inc.", "Cisco", "Cisco Inc"] + "aliases": ["Cisco System", "Cisco Systems Inc., Intel, Microsoft, Dorkbotz", "Cisco System, Inc., Nebula, Inc.", "Cisco", "Cisco Inc"] }, { "domains": ["citrix.com"], @@ -5863,12 +5855,11 @@ { "domains": ["cloudbasesolutions.com"], "company_name": "Cloudbase Solutions", - "aliases": ["Cloudbase Solutions Srl", "Cloudbase"] + "aliases": ["Cloudbase"] }, { "domains": ["cloudbau.de"], - "company_name": "Cloudbau", - "aliases": ["cloudbau GmbH"] + "company_name": "Cloudbau" }, { "domains": ["cloudscaling.com"], @@ -5910,8 +5901,7 @@ }, { "domains": ["cybera.ca"], - "company_name": "Cybera", - "aliases": ["Cybera Inc"] + "company_name": "Cybera" }, { "domains": ["debian.org"], @@ -5921,7 +5911,7 @@ { "domains": ["dell.com", "software.dell.com"], "company_name": "Dell", - "aliases": ["Dell & Ganette Publishing", "Dell Inc", "Dell, Inc., Cabarrus County Schools"] + "aliases": ["Dell & Ganette Publishing", "Dell, Inc., Cabarrus County Schools", "Dell & Ganette Publishing"] }, { "domains": ["denali-systems.com"], @@ -5946,8 +5936,7 @@ }, { "domains": ["ebay.com", "ebaysf.com"], - "company_name": "eBay", - "aliases": ["ebay inc", "eBay Inc.", "eBay, Inc."] + "company_name": "eBay" }, { "domains": ["embrane.com"], @@ -5956,7 +5945,7 @@ { "domains": ["emc.com"], "company_name": "EMC", - "aliases": ["EMC corp", "EMC Corporation", "EMC Corportion", "EMC employee; Russian Cloud Computing Professional Association - Head of executive commitee", "EMC, VMWare"] + "aliases": ["EMC Corportion", "EMC employee; Russian Cloud Computing Professional Association - Head of executive commitee", "EMC, VMWare"] }, { "domains": ["endurancewindpower.com"], @@ -5964,8 +5953,7 @@ }, { "domains": ["enovance.com"], - "company_name": "eNovance", - "aliases": ["eNovance Inc"] + "company_name": "eNovance" }, { "domains": ["epam.com"], @@ -5974,7 +5962,7 @@ { "domains": ["ericsson.com"], "company_name": "Ericsson", - "aliases": ["Ericsson AB", "Ericsson Research"] + "aliases": ["Ericsson AB", "Ericsson Research", "Ericcson AB"] }, { "domains": ["fathomdb.com"], @@ -5991,8 +5979,7 @@ }, { "domains": ["fujitsu.com"], - "company_name": "Fujitsu", - "aliases": ["Fujitsu Limited"] + "company_name": "Fujitsu" }, { "domains": ["getchef.com", "opscode.com"], @@ -6004,8 +5991,7 @@ }, { "domains": ["godaddy.com"], - "company_name": "Go Daddy", - "aliases": ["GoDaddy", "Go Daddy, LLC"] + "company_name": "Go Daddy" }, { "domains": ["gplhost.com"], @@ -6048,7 +6034,7 @@ { "domains": ["hds.com"], "company_name": "Hitachi", - "aliases": ["Hitachi Data Systems", "Hitachi, Ltd.", "Hitachi,Ltd."] + "aliases": ["Hitachi Data Systems"] }, { "domains": ["hortonworks.com"], @@ -6057,7 +6043,7 @@ { "domains": ["hp.com"], "company_name": "HP", - "aliases": ["HP Cloud", "HP ES GD China", "HP, IBM", "HP Software", "HP Storage Division", "Hewlett Packard", "Hewlett-Packard Company", "Hewlett-Packard", "Hewllet-Packard"] + "aliases": ["HP Cloud", "HP ES GD China", "HP, IBM", "HP Software", "HP Storage Division", "Hewlett Packard", "Hewlett-Packard Company", "Hewlett-Packard", "Hewllet-Packard", "HP R and D", "HP Cloud OS", "HP Networking", "hewelett-packard company", "HewlettPackard", "Hewlett-Pack"] }, { "domains": ["huawei.com"], @@ -6067,7 +6053,7 @@ { "domains": ["ibm.com", "linux.vnet.ibm.com"], "company_name": "IBM", - "aliases": ["IBM Australia", "IBM Canada", "IBM Canada Ltd", "IBM China", "IBM Corporation", "IBM India Pvt Ltd", "IBM India Pvt. Ltd.", "IBM Japan, Ltd.", "IBM Research", "IBM Research - China", "IBM Research Lab, India"] + "aliases": ["IBM Australia", "IBM Canada", "IBM Canada Ltd", "IBM China", "IBM Corporation", "IBM India Pvt Ltd", "IBM India Pvt. Ltd.", "IBM Japan, Ltd.", "IBM Research", "IBM Research - China", "IBM Research Lab, India", "IBM Deutschland Research & Development GmbH", "International Business Machines Corporation"] }, { "domains": ["ifca.unican.es"], @@ -6091,7 +6077,7 @@ { "domains": ["intel.com"], "company_name": "Intel", - "aliases": ["Intel Corp.", "Intel Corporation", "Intel Media", "Intel OTC", "Intern at intel"] + "aliases": ["Intel Media", "Intel OTC", "Intern at intel", "Intel Security"] }, { "domains": ["interhost.no"], @@ -6133,8 +6119,7 @@ }, { "domains": ["izeltech.com"], - "company_name": "Izel Technologies", - "aliases": ["Izel Technologies Inc."] + "company_name": "Izel Technologies" }, { "domains": ["jhuapl.edu"], @@ -6194,8 +6179,7 @@ }, { "domains": ["maginatics.com"], - "company_name": "Maginatics", - "aliases": ["Maginatics, Inc."] + "company_name": "Maginatics" }, { "domains": ["managedit.ie"], @@ -6207,8 +6191,7 @@ }, { "domains": ["memset.com"], - "company_name": "Memset", - "aliases": ["Memset Ltd"] + "company_name": "Memset" }, { "domains": ["metacloud.com"], @@ -6220,8 +6203,7 @@ }, { "domains": ["mirantis.com", "mirantis.ru"], - "company_name": "Mirantis", - "aliases": ["Mirantis Inc", "Mirantis Inc.", "Mirantis, Inc", "Mirantis, Inc.", "Mirantis IT"] + "company_name": "Mirantis" }, { "domains": ["mit.edu"], @@ -6244,17 +6226,16 @@ { "domains": ["nebula.com", "ansolabs.com"], "company_name": "Nebula", - "aliases": ["Nebula Inc.", "Nebula, Inc. ; CFO Tools"] + "aliases": ["Nebula, Inc. ; CFO Tools", "Nebulaworks"] }, { "domains": ["nec.com", "nec.co.jp", "nectechnologies.in"], "company_name": "NEC", - "aliases": ["NEC Europe Ltd.", "NEC Soft, Ltd.", "NEC Technologies India Ltd."] + "aliases": ["NEC Europe Ltd.", "NEC Soft, Ltd.", "NEC Technologies India Ltd.", "NEC Technlogies India Ltd"] }, { "domains": ["netapp.com"], - "company_name": "NetApp", - "aliases": ["NetApp Inc", "NetApp, Inc."] + "company_name": "NetApp" }, { "domains": ["netease.com"], @@ -6279,8 +6260,7 @@ }, { "domains": ["nuagenetworks.net"], - "company_name": "Nuage Networks", - "aliases": ["nuage networks"] + "company_name": "Nuage Networks" }, { "domains": ["numergy.com", "numergy.fr"], @@ -6293,7 +6273,7 @@ { "domains": ["oneconvergence.com"], "company_name": "One Convergence", - "aliases": ["One Convergence Devices Pvt. Ltd", "One Convergence Inc.", "OneConvergence", "Oneconvergence Devices Pvt Ltd", "One Convergence Devices"] + "aliases": ["One Convergence Devices Pvt. Ltd", "Oneconvergence Devices Pvt Ltd", "One Convergence Devices"] }, { "domains": ["optiflows.com"], @@ -6301,8 +6281,7 @@ }, { "domains": ["oracle.com"], - "company_name": "Oracle", - "aliases": ["Oracle Corp."] + "company_name": "Oracle" }, { "domains": ["orange.com"], @@ -6320,12 +6299,12 @@ { "domains": ["persistent.co.in"], "company_name": "Persistent Systems", - "aliases": ["Persistent Systems Limited"] + "aliases": ["Persistent System Limited", "persistent sys limited", "Persistent Ltd"] }, { "domains": ["pistoncloud.com"], "company_name": "Piston Cloud", - "aliases": ["Piston Cloud Computing, Inc."] + "aliases": ["Piston Cloud Computing, Inc.", "Piston"] }, { "domains": ["playhaven.com"], @@ -6333,8 +6312,7 @@ }, { "domains": ["plumgrid.com"], - "company_name": "PLUMgrid", - "aliases": ["Plumgrid inc", "Plumgrid Inc."] + "company_name": "PLUMgrid" }, { "domains": ["pubyun.com"], @@ -6357,14 +6335,17 @@ "company_name": "Rackspace", "aliases": ["Rackspace, Cloudscaling, Korea Telcom, friends with lots of people", "Rackspace.com", "Rackspace Hosting"] }, + { + "domains": ["rackwareinc.com"], + "company_name": "Rackware" + }, { "domains": ["radisys.com"], "company_name": "Radisys" }, { "domains": ["radware.com"], - "company_name": "Radware", - "aliases": ["Radware Ltd."] + "company_name": "Radware" }, { "domains": ["ravellosystems.com"], @@ -6373,7 +6354,7 @@ { "domains": ["redhat.com", "gluster.com"], "company_name": "Red Hat", - "aliases": ["Red Hat Canada, Inc", "Red Hat Czech, s.r.o.", "Red Hat Inc.", "Red Hat, Inc., Bloomberg L.P.", "Red Hat India Pvt. Ltd.", "Red Hat Software", "RedHat"] + "aliases": ["Red Hat Canada, Inc", "Red Hat Czech, s.r.o.", "Red Hat, Inc., Bloomberg L.P.", "Red Hat India Pvt. Ltd.", "Red Hat Software"] }, { "domains": ["reduxio.com"], @@ -6394,8 +6375,7 @@ }, { "domains": ["scality.com"], - "company_name": "Scality", - "aliases": ["Scality Inc"] + "company_name": "Scality" }, { "domains": ["sdsc.edu"], @@ -6417,8 +6397,7 @@ }, { "domains": ["snabb.co"], - "company_name": "Snabb", - "aliases": ["Snabb GmbH"] + "company_name": "Snabb" }, { "domains": ["softlayer.com"], @@ -6435,8 +6414,7 @@ }, { "domains": ["spilgames.com"], - "company_name": "Spil Games", - "aliases": ["Spil Games B.V."] + "company_name": "Spil Games" }, { "domains": ["stackinsider.com"], @@ -6465,13 +6443,11 @@ }, { "domains": ["swiftstack.com"], - "company_name": "SwiftStack", - "aliases": ["SwiftStack Inc."] + "company_name": "SwiftStack" }, { "domains": ["switch.ch"], - "company_name": "Switch", - "aliases": ["SWITCH"] + "company_name": "Switch" }, { "domains": ["symantec.com"], @@ -6488,18 +6464,16 @@ }, { "domains": ["telekom.de"], - "company_name": "Deutsche Telekom", - "aliases": ["Deutsche Telekom AG"] + "company_name": "Deutsche Telekom" }, { "domains": ["tesora.com", "parelastic.com"], "company_name": "Tesora Corp", - "aliases": ["ParElastic Corp", "ParElastic"] + "aliases": ["ParElastic Corp", "ParElastic", "Tesora.com"] }, { "domains": ["thalesgroup.com", "mythalesgroup.com"], - "company_name": "Thales", - "aliases": ["Thales Group"] + "company_name": "Thales" }, { "domains": ["thoughtworks.com"], @@ -6515,8 +6489,7 @@ }, { "domains": ["tunnelvisionlabs.com"], - "company_name": "Tunnel Vision Laboratories", - "aliases": ["Tunnel Vision Laboratories, LLC"] + "company_name": "Tunnel Vision Laboratories" }, { "domains": ["ubisoft.com"], @@ -6578,7 +6551,7 @@ { "domains": ["vmware.com", "nicira.com"], "company_name": "VMware", - "aliases": ["CYSO VMWARE DHPA"] + "aliases": ["CYSO VMWARE DHPA", "VMware, Nicira, Telstra, Accenture"] }, { "domains": ["wanclouds.net"], @@ -6608,13 +6581,11 @@ }, { "domains": ["xlab.si"], - "company_name": "Xlab", - "aliases": ["XLAB d.o.o."] + "company_name": "Xlab" }, { "domains": ["yahoo-inc.com"], - "company_name": "Yahoo!", - "aliases": ["Yahoo"] + "company_name": "Yahoo!" }, { "domains": ["yandex-team.ru"], diff --git a/stackalytics/processor/default_data_processor.py b/stackalytics/processor/default_data_processor.py index 79212e0d2..ea24a14e0 100644 --- a/stackalytics/processor/default_data_processor.py +++ b/stackalytics/processor/default_data_processor.py @@ -124,7 +124,11 @@ def _store_companies(runtime_storage_inst, companies): if 'aliases' in company: for alias in company['aliases']: - domains_index[alias] = company['company_name'] + normalized_alias = utils.normalize_company_name(alias) + domains_index[normalized_alias] = company['company_name'] + normalized_company_name = utils.normalize_company_name( + company['company_name']) + domains_index[normalized_company_name] = company['company_name'] runtime_storage_inst.set_by_key('companies', domains_index) @@ -175,7 +179,7 @@ def _get_changed_member_records(runtime_storage_inst, record_processor_inst): if record['record_type'] == 'member' and 'company_name' in record: company_draft = record['company_draft'] company_name = record_processor_inst.domains_index.get( - company_draft) or company_draft + utils.normalize_company_name(company_draft)) or company_draft if company_name != record['company_name']: record['company_name'] = company_name diff --git a/stackalytics/processor/record_processor.py b/stackalytics/processor/record_processor.py index 47bb7a0b2..30d8eabdd 100644 --- a/stackalytics/processor/record_processor.py +++ b/stackalytics/processor/record_processor.py @@ -430,7 +430,8 @@ class RecordProcessor(object): record['module'] = 'unknown' company_draft = record['company_draft'] - company_name = self.domains_index.get(company_draft) or company_draft + company_name = self.domains_index.get(utils.normalize_company_name( + company_draft)) or company_draft # author_email is a key to create new user record['author_email'] = user_id diff --git a/stackalytics/processor/utils.py b/stackalytics/processor/utils.py index ddc0e1709..d718ca4fa 100644 --- a/stackalytics/processor/utils.py +++ b/stackalytics/processor/utils.py @@ -189,3 +189,16 @@ def make_module_group(module_group_id, name=None, modules=None, tag='module'): 'module_group_name': name or module_group_id, 'modules': modules or [module_group_id], 'tag': tag} + +BAD_NAME_SUFFIXES = ['Ltd', 'Pvt', 'Inc', 'GmbH', 'AG', 'Corporation', 'Corp', + 'Company', 'Co', 'Group', 'Srl', 'Limited', 'LLC', 'IT'] + +BAD_NAME_SUFFIXES_WITH_STOPS = ['S.p.A.', 's.r.o.', 'L.P.', 'B.V.', 'K.K.', + 'd.o.o.'] + + +def normalize_company_name(name): + regex = '(\\b(' + '|'.join(BAD_NAME_SUFFIXES) + ')\\b)' + regex += '|' + '((^|\\s)(' + '|'.join(BAD_NAME_SUFFIXES_WITH_STOPS) + '))' + name = re.sub(re.compile(regex, re.IGNORECASE), '', name) + return ''.join([c.lower() for c in name if c.isalnum()]) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index e173d1def..2c71e1e34 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -77,3 +77,16 @@ class TestUtils(testtools.TestCase): {'index': 1, 'name': 'C'}] self.assertEqual(expected, utils.add_index( sequence, start=0, item_filter=lambda x: x['name'] != 'B')) + + def test_normalize_company_name(self): + company_names = ['EMC Corporation', 'Abc, corp..', 'Mirantis IT.', + 'Red Hat, Inc.', 'abc s.r.o. ABC', '2s.r.o. co', + 'AL.P.B L.P. s.r.o. s.r.o. C ltd.'] + correct_normalized_company_names = ['emc', 'abc', 'mirantis', + 'redhat', 'abcabc', '2sro', + 'alpbc'] + normalized_company_names = [utils.normalize_company_name(name) + for name in company_names] + + self.assertEqual(normalized_company_names, + correct_normalized_company_names)