mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 08:10:45 +00:00
many changes ;-)
Former-commit-id: cb4aea844e960e4af4dc673ebc8eec49a7d12b13
This commit is contained in:
198
Example_Arth03.ngsfilter
Normal file
198
Example_Arth03.ngsfilter
Normal file
@ -0,0 +1,198 @@
|
||||
CIRA_Arth03 CIRA001_A acacacac:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01A;
|
||||
CIRA_Arth03 CIRA002_A acagcaca:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01B;
|
||||
CIRA_Arth03 CIRA003_A gtgtacat:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01C;
|
||||
CIRA_Arth03 CIRA004_A tatgtcag:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01D;
|
||||
CIRA_Arth03 CIRA005_A tagtcgca:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01E;
|
||||
CIRA_Arth03 CIRA006_A tactatac:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01F;
|
||||
CIRA_Arth03 CIRA007_A actagatc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01G;
|
||||
CIRA_Arth03 BLNK001 gatcgcga:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_01H;
|
||||
CIRA_Arth03 CIRA008_A acacacac:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02A;
|
||||
CIRA_Arth03 CIRA009_A acagcaca:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02B;
|
||||
CIRA_Arth03 CIRA010_A gtgtacat:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02C;
|
||||
CIRA_Arth03 CPCR01_A tatgtcag:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02D;
|
||||
CIRA_Arth03 CIRA011_A tagtcgca:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02E;
|
||||
CIRA_Arth03 CIRA012_A tactatac:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02F;
|
||||
CIRA_Arth03 BLNK002 actagatc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02G;
|
||||
CIRA_Arth03 CIRA013_A gatcgcga:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_02H;
|
||||
CIRA_Arth03 CIRA014_A acacacac:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03A;
|
||||
CIRA_Arth03 CIRA015_A acagcaca:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03B;
|
||||
CIRA_Arth03 CIRA016_A gtgtacat:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03C;
|
||||
CIRA_Arth03 CIRA017_A tatgtcag:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03D;
|
||||
CIRA_Arth03 CIRA018_A tagtcgca:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03E;
|
||||
CIRA_Arth03 BLNK003 tactatac:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03F;
|
||||
CIRA_Arth03 CIRA019_A actagatc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03G;
|
||||
CIRA_Arth03 CIRA020_A gatcgcga:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_03H;
|
||||
CIRA_Arth03 CIRA021_A acacacac:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04A;
|
||||
CIRA_Arth03 CIRA022_A acagcaca:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04B;
|
||||
CIRA_Arth03 CIRA023_A gtgtacat:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04C;
|
||||
CIRA_Arth03 CIRA024_A tatgtcag:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04D;
|
||||
CIRA_Arth03 BLNK004 tagtcgca:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04E;
|
||||
CIRA_Arth03 CIRA025_A tactatac:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04F;
|
||||
CIRA_Arth03 CIRA026_A actagatc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04G;
|
||||
CIRA_Arth03 CIRA027_A gatcgcga:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_04H;
|
||||
CIRA_Arth03 CPOS232_A acacacac:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05A;
|
||||
CIRA_Arth03 CIRA028_A acagcaca:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05B;
|
||||
CIRA_Arth03 CIRA029_A gtgtacat:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05C;
|
||||
CIRA_Arth03 BLNK005 tatgtcag:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05D;
|
||||
CIRA_Arth03 CIRA030_A tagtcgca:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05E;
|
||||
CIRA_Arth03 CIRA031_A tactatac:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05F;
|
||||
CIRA_Arth03 CIRA032_A actagatc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05G;
|
||||
CIRA_Arth03 CIRA033_A gatcgcga:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_05H;
|
||||
CIRA_Arth03 CIRA034_A acacacac:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06A;
|
||||
CIRA_Arth03 CIRA035_A acagcaca:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06B;
|
||||
CIRA_Arth03 BLNK006 gtgtacat:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06C;
|
||||
CIRA_Arth03 CIRA036_A tatgtcag:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06D;
|
||||
CIRA_Arth03 CIRA037_A tagtcgca:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06E;
|
||||
CIRA_Arth03 CIRA038_A tactatac:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06F;
|
||||
CIRA_Arth03 CIRA039_A actagatc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06G;
|
||||
CIRA_Arth03 CIRA040_A gatcgcga:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_06H;
|
||||
CIRA_Arth03 CIRA041_A acacacac:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07A;
|
||||
CIRA_Arth03 BLNK007 acagcaca:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07B;
|
||||
CIRA_Arth03 CIRA042_A gtgtacat:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07C;
|
||||
CIRA_Arth03 CIRA043_A tatgtcag:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07D;
|
||||
CIRA_Arth03 CIRA044_A tagtcgca:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07E;
|
||||
CIRA_Arth03 CIRA045_A tactatac:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07F;
|
||||
CIRA_Arth03 CIRA046_A actagatc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07G;
|
||||
CIRA_Arth03 CIRA047_A gatcgcga:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_07H;
|
||||
CIRA_Arth03 BLNK008 acacacac:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08A;
|
||||
CIRA_Arth03 CIRA048_A acagcaca:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08B;
|
||||
CIRA_Arth03 CIRA049_A gtgtacat:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08C;
|
||||
CIRA_Arth03 CIRA050_A tatgtcag:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08D;
|
||||
CIRA_Arth03 CIRA051_A tagtcgca:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08E;
|
||||
CIRA_Arth03 CPCR02_A tactatac:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08F;
|
||||
CIRA_Arth03 CIRA052_A actagatc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08G;
|
||||
CIRA_Arth03 CIRA053_A gatcgcga:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_08H;
|
||||
CIRA_Arth03 CIRA054_A acacacac:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_09A;
|
||||
CIRA_Arth03 CPOS241_A acagcaca:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=01_09B;
|
||||
CIRA_Arth03 CIRA001_B cgctctcg:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01A;
|
||||
CIRA_Arth03 CIRA002_B gtcgtaga:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01B;
|
||||
CIRA_Arth03 CIRA003_B gtcacgtc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01C;
|
||||
CIRA_Arth03 CIRA004_B gactgatg:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01D;
|
||||
CIRA_Arth03 CIRA005_B agactatg:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01E;
|
||||
CIRA_Arth03 CIRA006_B gcgtcagc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01F;
|
||||
CIRA_Arth03 CIRA007_B tgacatca:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01G;
|
||||
CIRA_Arth03 BLNK009 acatgtgt:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_01H;
|
||||
CIRA_Arth03 CIRA008_B cgctctcg:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02A;
|
||||
CIRA_Arth03 CIRA009_B gtcgtaga:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02B;
|
||||
CIRA_Arth03 CIRA010_B gtcacgtc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02C;
|
||||
CIRA_Arth03 CPCR01_B gactgatg:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02D;
|
||||
CIRA_Arth03 CIRA011_B agactatg:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02E;
|
||||
CIRA_Arth03 CIRA012_B gcgtcagc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02F;
|
||||
CIRA_Arth03 BLNK010 tgacatca:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02G;
|
||||
CIRA_Arth03 CIRA013_B acatgtgt:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_02H;
|
||||
CIRA_Arth03 CIRA014_B cgctctcg:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03A;
|
||||
CIRA_Arth03 CIRA015_B gtcgtaga:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03B;
|
||||
CIRA_Arth03 CIRA016_B gtcacgtc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03C;
|
||||
CIRA_Arth03 CIRA017_B gactgatg:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03D;
|
||||
CIRA_Arth03 CIRA018_B agactatg:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03E;
|
||||
CIRA_Arth03 BLNK011 gcgtcagc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03F;
|
||||
CIRA_Arth03 CIRA019_B tgacatca:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03G;
|
||||
CIRA_Arth03 CIRA020_B acatgtgt:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_03H;
|
||||
CIRA_Arth03 CIRA021_B cgctctcg:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04A;
|
||||
CIRA_Arth03 CIRA022_B gtcgtaga:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04B;
|
||||
CIRA_Arth03 CIRA023_B gtcacgtc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04C;
|
||||
CIRA_Arth03 CIRA024_B gactgatg:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04D;
|
||||
CIRA_Arth03 BLNK012 agactatg:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04E;
|
||||
CIRA_Arth03 CIRA025_B gcgtcagc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04F;
|
||||
CIRA_Arth03 CIRA026_B tgacatca:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04G;
|
||||
CIRA_Arth03 CIRA027_B acatgtgt:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_04H;
|
||||
CIRA_Arth03 CPOS232_B cgctctcg:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05A;
|
||||
CIRA_Arth03 CIRA028_B gtcgtaga:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05B;
|
||||
CIRA_Arth03 CIRA029_B gtcacgtc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05C;
|
||||
CIRA_Arth03 BLNK013 gactgatg:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05D;
|
||||
CIRA_Arth03 CIRA030_B agactatg:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05E;
|
||||
CIRA_Arth03 CIRA031_B gcgtcagc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05F;
|
||||
CIRA_Arth03 CIRA032_B tgacatca:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05G;
|
||||
CIRA_Arth03 CIRA033_B acatgtgt:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_05H;
|
||||
CIRA_Arth03 CIRA034_B cgctctcg:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06A;
|
||||
CIRA_Arth03 CIRA035_B gtcgtaga:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06B;
|
||||
CIRA_Arth03 BLNK014 gtcacgtc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06C;
|
||||
CIRA_Arth03 CIRA036_B gactgatg:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06D;
|
||||
CIRA_Arth03 CIRA037_B agactatg:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06E;
|
||||
CIRA_Arth03 CIRA038_B gcgtcagc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06F;
|
||||
CIRA_Arth03 CIRA039_B tgacatca:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06G;
|
||||
CIRA_Arth03 CIRA040_B acatgtgt:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_06H;
|
||||
CIRA_Arth03 CIRA041_B cgctctcg:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07A;
|
||||
CIRA_Arth03 BLNK015 gtcgtaga:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07B;
|
||||
CIRA_Arth03 CIRA042_B gtcacgtc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07C;
|
||||
CIRA_Arth03 CIRA043_B gactgatg:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07D;
|
||||
CIRA_Arth03 CIRA044_B agactatg:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07E;
|
||||
CIRA_Arth03 CIRA045_B gcgtcagc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07F;
|
||||
CIRA_Arth03 CIRA046_B tgacatca:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07G;
|
||||
CIRA_Arth03 CIRA047_B acatgtgt:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_07H;
|
||||
CIRA_Arth03 BLNK016 cgctctcg:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08A;
|
||||
CIRA_Arth03 CIRA048_B gtcgtaga:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08B;
|
||||
CIRA_Arth03 CIRA049_B gtcacgtc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08C;
|
||||
CIRA_Arth03 CIRA050_B gactgatg:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08D;
|
||||
CIRA_Arth03 CIRA051_B agactatg:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08E;
|
||||
CIRA_Arth03 CPCR02_B gcgtcagc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08F;
|
||||
CIRA_Arth03 CIRA052_B tgacatca:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08G;
|
||||
CIRA_Arth03 CIRA053_B acatgtgt:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_08H;
|
||||
CIRA_Arth03 CIRA054_B cgctctcg:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_09A;
|
||||
CIRA_Arth03 CPOS241_B gtcgtaga:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=02_09B;
|
||||
CIRA_Arth03 CIRA002_C atgatcgc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01B;
|
||||
CIRA_Arth03 CIRA003_C acgacgag:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01C;
|
||||
CIRA_Arth03 CIRA004_C catcagtc:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01D;
|
||||
CIRA_Arth03 CIRA005_C atcagtca:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01E;
|
||||
CIRA_Arth03 CIRA006_C tctactga:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01F;
|
||||
CIRA_Arth03 CIRA007_C gatgatct:acacacac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_01G;
|
||||
CIRA_Arth03 CIRA009_C atgatcgc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02B;
|
||||
CIRA_Arth03 CIRA010_C acgacgag:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02C;
|
||||
CIRA_Arth03 CPCR01_C catcagtc:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02D;
|
||||
CIRA_Arth03 CIRA011_C atcagtca:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02E;
|
||||
CIRA_Arth03 CIRA012_C tctactga:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02F;
|
||||
CIRA_Arth03 BLNK018 gatgatct:acagcaca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_02G;
|
||||
CIRA_Arth03 CIRA015_C atgatcgc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03B;
|
||||
CIRA_Arth03 CIRA016_C acgacgag:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03C;
|
||||
CIRA_Arth03 CIRA017_C catcagtc:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03D;
|
||||
CIRA_Arth03 CIRA018_C atcagtca:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03E;
|
||||
CIRA_Arth03 BLNK019 tctactga:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03F;
|
||||
CIRA_Arth03 CIRA019_C gatgatct:gtgtacat GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_03G;
|
||||
CIRA_Arth03 CIRA022_C atgatcgc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04B;
|
||||
CIRA_Arth03 CIRA023_C acgacgag:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04C;
|
||||
CIRA_Arth03 CIRA024_C catcagtc:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04D;
|
||||
CIRA_Arth03 BLNK020 atcagtca:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04E;
|
||||
CIRA_Arth03 CIRA025_C tctactga:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04F;
|
||||
CIRA_Arth03 CIRA026_C gatgatct:tatgtcag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_04G;
|
||||
CIRA_Arth03 CIRA028_C atgatcgc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05B;
|
||||
CIRA_Arth03 CIRA029_C acgacgag:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05C;
|
||||
CIRA_Arth03 BLNK021 catcagtc:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05D;
|
||||
CIRA_Arth03 CIRA030_C atcagtca:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05E;
|
||||
CIRA_Arth03 CIRA031_C tctactga:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05F;
|
||||
CIRA_Arth03 CIRA032_C gatgatct:tagtcgca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_05G;
|
||||
CIRA_Arth03 CIRA035_C atgatcgc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06B;
|
||||
CIRA_Arth03 BLNK022 acgacgag:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06C;
|
||||
CIRA_Arth03 CIRA036_C catcagtc:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06D;
|
||||
CIRA_Arth03 CIRA037_C atcagtca:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06E;
|
||||
CIRA_Arth03 CIRA038_C tctactga:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06F;
|
||||
CIRA_Arth03 CIRA039_C gatgatct:tactatac GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_06G;
|
||||
CIRA_Arth03 BLNK023 atgatcgc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07B;
|
||||
CIRA_Arth03 CIRA042_C acgacgag:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07C;
|
||||
CIRA_Arth03 CIRA043_C catcagtc:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07D;
|
||||
CIRA_Arth03 CIRA044_C atcagtca:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07E;
|
||||
CIRA_Arth03 CIRA045_C tctactga:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07F;
|
||||
CIRA_Arth03 CIRA046_C gatgatct:actagatc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_07G;
|
||||
CIRA_Arth03 CIRA048_C atgatcgc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08B;
|
||||
CIRA_Arth03 CIRA049_C acgacgag:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08C;
|
||||
CIRA_Arth03 CIRA050_C catcagtc:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08D;
|
||||
CIRA_Arth03 CIRA051_C atcagtca:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08E;
|
||||
CIRA_Arth03 CPCR02_C tctactga:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08F;
|
||||
CIRA_Arth03 CIRA052_C gatgatct:gatcgcga GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_08G;
|
||||
CIRA_Arth03 CPOS241_C atgatcgc:cgctctcg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=03_09B;
|
||||
CIRA_Arth03 CIRA001_C acacacac:agactatg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_01A;
|
||||
CIRA_Arth03 BLNK017 gatcgcga:agactatg GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_01H;
|
||||
CIRA_Arth03 CIRA008_C acacacac:gcgtcagc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_02A;
|
||||
CIRA_Arth03 CIRA013_C gatcgcga:gcgtcagc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_02H;
|
||||
CIRA_Arth03 CIRA014_C acacacac:tgacatca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_03A;
|
||||
CIRA_Arth03 CIRA020_C gatcgcga:tgacatca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_03H;
|
||||
CIRA_Arth03 CIRA021_C acacacac:acatgtgt GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_04A;
|
||||
CIRA_Arth03 CIRA027_C gatcgcga:acatgtgt GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_04H;
|
||||
CIRA_Arth03 CPOS232_C acacacac:gtacgact GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_05A;
|
||||
CIRA_Arth03 CIRA033_C gatcgcga:gtacgact GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_05H;
|
||||
CIRA_Arth03 CIRA034_C acacacac:atgatcgc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_06A;
|
||||
CIRA_Arth03 CIRA040_C gatcgcga:atgatcgc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_06H;
|
||||
CIRA_Arth03 CIRA041_C acacacac:acgacgag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_07A;
|
||||
CIRA_Arth03 CIRA047_C gatcgcga:acgacgag GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_07H;
|
||||
CIRA_Arth03 BLNK024 acacacac:catcagtc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_08A;
|
||||
CIRA_Arth03 CIRA053_C gatcgcga:catcagtc GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_08H;
|
||||
CIRA_Arth03 CIRA054_C acacacac:atcagtca GATAGAAACCRACCTGGYT GCGACCTCGATGTTGRATT F @ position=05_09A;
|
409337
SPER01.csv
Normal file
409337
SPER01.csv
Normal file
File diff suppressed because it is too large
Load Diff
1
SPER03.csv.REMOVED.git-id
Normal file
1
SPER03.csv.REMOVED.git-id
Normal file
@ -0,0 +1 @@
|
||||
412536492810152d7835808871ea2b0289a770f7
|
BIN
doc/.DS_Store
vendored
BIN
doc/.DS_Store
vendored
Binary file not shown.
6
doc/book/.ipynb_checkpoints/Untitled-checkpoint.ipynb
Normal file
6
doc/book/.ipynb_checkpoints/Untitled-checkpoint.ipynb
Normal file
@ -0,0 +1,6 @@
|
||||
{
|
||||
"cells": [],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
6
doc/book/Untitled.ipynb
Normal file
6
doc/book/Untitled.ipynb
Normal file
@ -0,0 +1,6 @@
|
||||
{
|
||||
"cells": [],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
118
doc/book/wolf_data/sort_gb_order.sh
Executable file
118
doc/book/wolf_data/sort_gb_order.sh
Executable file
@ -0,0 +1,118 @@
|
||||
#!/bin/bash
#OAR -n gbsort
##OAR --array 50
##OAR --array-param-file 50_first.tsv
#OAR --project phyloalps
#OAR -l nodes=1/core=10,walltime=24:00:00
#OAR -O gbsort.%jobid%.log
#OAR -E gbsort.%jobid%.log

# Lines starting with "#OAR" are batch-scheduler directives; the ones
# starting with "##OAR" are disabled.
#
# Paths kept as a reminder by the original author:
#   /silenus/PROJECTS/pr-phyloalps/coissac
#   /bettik/LECA/ENVIRONMENT/data/biodatabase/genbank

#
# URLs of the remote resources
#

# Root of the NCBI FTP site; the two URLs below are derived from it.
NCBIURL="https://ftp.ncbi.nlm.nih.gov/"
# Directory holding the Genbank flat files.
GBURL="${NCBIURL}genbank/"
# Archive of the NCBI taxonomy dump.
TAXOURL="${NCBIURL}pub/taxonomy/taxdump.tar.gz"

# Name of the download log file.
LOGFILE="download.log"

#
# Genbank divisions of interest, as a "|"-separated alternation
#

DIV="bct|inv|mam|phg|pln|pri|rod|vrl|vrt"
|
||||
|
||||
############################
|
||||
#
|
||||
# Functions
|
||||
#
|
||||
############################
|
||||
|
||||
# pattern_at_rank TAXODIR RANK
#
# Prints an anchored alternation "^(id1|id2|...)$" made of the first column
# of every line of TAXODIR/nodes.dmp whose third "|"-separated column
# matches RANK surrounded by tab characters. Leading and trailing blanks
# are removed from each id. When no line matches, "^()$" is printed.
pattern_at_rank() {
    local taxdir="$1"
    local wanted="$2"
    local ids

    # The ids are joined with "|" directly inside awk, so no trailing
    # separator has to be removed afterwards.
    ids=$(awk -F "|" -v rank="$wanted" '
        BEGIN { rank = "\t" rank "\t" }
        $3 ~ rank {
            gsub(/^[ \t]+|[ \t]+$/, "", $1)
            printf "%s%s", sep, $1
            sep = "|"
        }
    ' "${taxdir}/nodes.dmp")

    echo "^(${ids})$"
}
|
||||
|
||||
|
||||
# Usage: sort_gb_order.sh GBDIR
#
# GBDIR is a local directory containing one "Release-<number>" directory
# per Genbank release, each holding the *.seq.gz flat files.

if [[ -z "$1" ]] ; then
    echo "Usage: $0 <genbank_directory>" >&2
    exit 1
fi

# The script changes directory further down, so GBDIR is made absolute to
# keep the source files reachable when a relative path is given.
GBDIR=$(cd "$1" && pwd) || exit 1

#
# Look in the local Genbank directory for the most recent release number
# and create the corresponding working directory
#

echo "Looking at current Genbank release number"

# The release numbers are compared numerically so that, for example,
# release 99 is not ranked above release 251.
GB_Release_Number=$(for r in "${GBDIR}/Release-"* ; do
                        [[ -e "$r" ]] && basename "$r"
                    done \
                    | sed 's/^Release-//' \
                    | sort -rn \
                    | head -1)

# NOTE(review): the release number is pinned here, which discards the value
# detected just above. Kept as found; remove this line to use the detection.
GB_Release_Number=251

echo "identified latest release number is : ${GB_Release_Number}"

GBSOURCE="${GBDIR}/Release-${GB_Release_Number}"

mkdir -p "Release-${GB_Release_Number}"
cd "Release-${GB_Release_Number}" || exit

#
# Download the current NCBI taxonomy, unless it is already present
#
mkdir -p "ncbitaxo"

if [[ ! -f ncbitaxo/nodes.dmp ]] || [[ ! -f ncbitaxo/names.dmp ]] ; then
    curl "${TAXOURL}" \
    | tar -C "ncbitaxo" -zxf -
fi

#
# Annotate every Genbank flat file with its taxa at several ranks and
# distribute the sequences into files.
#
# Every sequence first receives division="misc-@-0" and section="misc-@-0".
# These defaults are then overwritten by the superkingdom, then kingdom
# (division) and by the phylum, then order (section) when the corresponding
# taxid is greater than 0 -- presumably the -p option restricts each
# obiannotate step to the matching sequences; confirm against the
# obitools4 documentation.
#
for f in "${GBSOURCE}/"*.seq.gz ; do

    # An unmatched glob is left as a literal pattern: skip it.
    [[ -e "$f" ]] || continue

    echo "PROCESSING : $f in $(pwd)"

    obiannotate --genbank -t ncbitaxo \
                --with-taxon-at-rank kingdom \
                --with-taxon-at-rank superkingdom \
                --with-taxon-at-rank phylum \
                --with-taxon-at-rank order \
                --with-taxon-at-rank family \
                --with-taxon-at-rank genus \
                -S division='"misc-@-0"' \
                -S section='"misc-@-0"' \
                "$f" \
    | obigrep -A genus_taxid -A family_taxid \
    | obigrep -p 'annotations.genus_taxid > 0 && annotations.family_taxid > 0' \
              -p 'annotations.phylum_taxid > 0 || annotations.order_taxid > 0' \
    | obiannotate -p 'annotations.superkingdom_taxid > 0' \
                  -S division='printf("%s-S-%d",subspc(annotations.superkingdom_name),annotations.superkingdom_taxid)' \
    | obiannotate -p 'annotations.kingdom_taxid > 0' \
                  -S division='printf("%s-K-%d",subspc(annotations.kingdom_name),annotations.kingdom_taxid)' \
    | obiannotate -p 'annotations.phylum_taxid > 0' \
                  -S section='printf("%s-P-%d",subspc(annotations.phylum_name),annotations.phylum_taxid)' \
    | obiannotate -p 'annotations.order_taxid > 0' \
                  -S section='printf("%s-O-%d",subspc(annotations.order_name),annotations.order_taxid)' \
    | obidistribute -Z -A -p "%s.fasta" -c section -d division

done
|
||||
|
31
obitools4/Dockerfile
Normal file
31
obitools4/Dockerfile
Normal file
@ -0,0 +1,31 @@
|
||||
# Build stage: installs the org-asm assembler into a Python virtual
# environment located at /org-assembler.
#
# NOTE(review): ubuntu:lunar is a non-LTS release -- confirm that its
# package repositories are still reachable before rebuilding this image.
FROM ubuntu:lunar AS builder
LABEL dockerfile.version="1"
LABEL software="obitools4"

WORKDIR /
RUN apt update --fix-missing && apt upgrade -y
RUN apt install -y build-essential
RUN apt install -y git tcsh bash gawk parallel gettext zlib1g-dev libglib2.0-0
# RUN git clone https://git.metabarcoding.org/org-asm/org-annotate.git
# RUN cd org-annotate/src && make && cd ../..
# RUN cd /org-annotate/data/its/ITSx_db/HMMs && \
#     rm *.h3* && \
#     for f in *.hmm ; do /org-annotate/ports/i386-linux/bin/hmmpress $f ; done

RUN apt install -y python3-pip python3-dev python3-venv
RUN git clone https://git.metabarcoding.org/org-asm/org-asm.git

# A `cd` inside a RUN instruction is lost as soon as that instruction ends,
# so the working directory is switched with WORKDIR instead: setup.py has to
# be run from the root of the cloned repository.
WORKDIR /org-asm
RUN python3 -m venv ../org-assembler
RUN bash -c '../org-assembler/bin/pip3 install -r /org-asm/requirements.txt'
RUN bash -c '../org-assembler/bin/python3 setup.py install --no-serenity'
WORKDIR /


# FROM ubuntu:lunar as phyloskims
# WORKDIR /
# RUN apt update --fix-missing && apt upgrade -y
# RUN apt install -y tcsh bash gawk parallel zlib1g libglib2.0-0
# COPY --from=builder /org-annotate /org-annotate
# RUN rm -rf /org-annotate/src
# COPY --from=builder /org-assembler /org-assembler
# RUN mkdir -p /data
|
2
pkg/obiformats/fastseq_write_with_index.go
Normal file
2
pkg/obiformats/fastseq_write_with_index.go
Normal file
@ -0,0 +1,2 @@
|
||||
package obiformats
|
||||
|
@ -3,93 +3,190 @@ package obistats
|
||||
import (
|
||||
"math"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/exp/rand"
|
||||
"gonum.org/v1/gonum/stat/sampleuv"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// AssignToClass applies the nearest neighbor algorithm to assign data points to classes.
|
||||
//
|
||||
// Parameters:
|
||||
// - data: a 2D slice of float64 representing the data points to be assigned.
|
||||
// - centers: a 2D slice of float64 representing the center points for each class.
|
||||
//
|
||||
// Return:
|
||||
// - classes: a slice of int representing the assigned class for each data point.
|
||||
func AssignToClass(data, centers *obiutils.Matrix[float64]) []int {
|
||||
classes := make([]int, len(*data))
|
||||
numData := len(*data)
|
||||
numCenters := len(*centers)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(numData)
|
||||
|
||||
for i := 0; i < numData; i++ {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
minDist := math.MaxFloat64
|
||||
minDistIndex := -1
|
||||
rowData := (*data)[i]
|
||||
|
||||
for j := 0; j < numCenters; j++ {
|
||||
centerData := (*centers)[j]
|
||||
dist := 0.0
|
||||
|
||||
for d, val := range rowData {
|
||||
diff := val - centerData[d]
|
||||
dist += diff * diff
|
||||
}
|
||||
|
||||
if dist < minDist {
|
||||
minDist = dist
|
||||
minDistIndex = j
|
||||
}
|
||||
}
|
||||
|
||||
classes[i] = minDistIndex
|
||||
}(i)
|
||||
func squareDist(a, b []float64) float64 {
|
||||
sum := 0.0
|
||||
for i := 0; i < len(a); i++ {
|
||||
diff := a[i] - b[i]
|
||||
sum += diff * diff
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
return classes
|
||||
return sum
|
||||
}
|
||||
|
||||
// ComputeCenters calculates the centers of clusters for a given data set.
|
||||
//
|
||||
// Parameters:
|
||||
// - data: a pointer to a matrix of float64 values representing the data set.
|
||||
// - k: an integer representing the number of clusters.
|
||||
// - classes: a slice of integers representing the assigned cluster for each data point.
|
||||
//
|
||||
// Returns:
|
||||
// - centers: a pointer to a matrix of float64 values representing the centers of the clusters.
|
||||
// ComputeCenters calculates the centers of clusters for a given data set.
|
||||
//
|
||||
// Parameters:
|
||||
// - data: a pointer to a matrix of float64 values representing the data set.
|
||||
// - k: an integer representing the number of clusters.
|
||||
// - classes: a slice of integers representing the assigned cluster for each data point.
|
||||
//
|
||||
// Returns:
|
||||
// - centers: a pointer to a matrix of float64 values representing the centers of the clusters.
|
||||
func ComputeCenters(data *obiutils.Matrix[float64], k int, classes []int) *obiutils.Matrix[float64] {
|
||||
centers := obiutils.Make2DNumericArray[float64](k, len((*data)[0]), true)
|
||||
ns := make([]int, k)
|
||||
func DefaultRG() *rand.Rand {
|
||||
return rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
type KmeansClustering struct {
|
||||
data *obiutils.Matrix[float64]
|
||||
rg *rand.Rand
|
||||
centers obiutils.Matrix[float64]
|
||||
icenters []int
|
||||
sizes []int
|
||||
distmin []float64
|
||||
classes []int
|
||||
}
|
||||
|
||||
for i := range ns {
|
||||
ns[i] = 0
|
||||
func MakeKmeansClustering(data *obiutils.Matrix[float64], k int, rg *rand.Rand) *KmeansClustering {
|
||||
distmin := make([]float64, len(*data))
|
||||
for i := 0; i < len(distmin); i++ {
|
||||
distmin[i] = math.MaxFloat64
|
||||
}
|
||||
|
||||
clustering := &KmeansClustering{
|
||||
data: data,
|
||||
icenters: make([]int, 0, k),
|
||||
sizes: make([]int, 0, k),
|
||||
centers: make(obiutils.Matrix[float64], 0, k),
|
||||
distmin: distmin,
|
||||
classes: make([]int, len(*data)),
|
||||
rg: rg,
|
||||
}
|
||||
|
||||
for i := 0; i < k; i++ {
|
||||
clustering.AddACenter()
|
||||
}
|
||||
|
||||
return clustering
|
||||
}
|
||||
|
||||
// K returns the number of clusters in the K-means clustering algorithm.
|
||||
//
|
||||
// No parameters.
|
||||
// Returns an integer.
|
||||
func (clustering *KmeansClustering) K() int {
|
||||
return len(clustering.icenters)
|
||||
}
|
||||
|
||||
// N returns the size of the dataset in the KmeansClustering instance.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// The return type is an integer.
|
||||
func (clustering *KmeansClustering) N() int {
|
||||
return len(*clustering.data)
|
||||
}
|
||||
|
||||
// Dimension returns the dimension of the KmeansClustering data.
|
||||
//
|
||||
// No parameters.
|
||||
// Returns an integer representing the dimension of the data.
|
||||
func (clustering *KmeansClustering) Dimension() int {
|
||||
return len((*clustering.data)[0])
|
||||
}
|
||||
func (clustering *KmeansClustering) AddACenter() {
|
||||
C := 0
|
||||
if clustering.K() == 0 {
|
||||
C = rand.Intn(clustering.N())
|
||||
} else {
|
||||
w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
|
||||
C, _ = w.Take()
|
||||
}
|
||||
clustering.icenters = append(clustering.icenters, C)
|
||||
clustering.sizes = append(clustering.sizes, 0)
|
||||
center := (*clustering.data)[C]
|
||||
clustering.centers = append(clustering.centers, center)
|
||||
|
||||
n := clustering.N()
|
||||
|
||||
for i := 0; i < n; i++ {
|
||||
d := squareDist((*clustering.data)[i], center)
|
||||
if d < clustering.distmin[i] {
|
||||
clustering.distmin[i] = d
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ResetEmptyCenters resets the empty centers in the KmeansClustering struct.
|
||||
//
|
||||
// It iterates over the centers and checks if their corresponding sizes are zero.
|
||||
// If a center is empty, a new weighted sample is taken with the help of the distmin and rg variables.
|
||||
// The new center is then assigned to the empty center index, and the sizes and centers arrays are updated accordingly.
|
||||
// Finally, the function returns the number of empty centers that were reset.
|
||||
func (clustering *KmeansClustering) ResetEmptyCenters() int {
|
||||
nreset := 0
|
||||
for i := 0; i < clustering.K(); i++ {
|
||||
if clustering.sizes[i] == 0 {
|
||||
w := sampleuv.NewWeighted(clustering.distmin, clustering.rg)
|
||||
C, _ := w.Take()
|
||||
clustering.icenters[i] = C
|
||||
clustering.centers[i] = (*clustering.data)[C]
|
||||
nreset++
|
||||
}
|
||||
}
|
||||
return nreset
|
||||
}
|
||||
|
||||
// AssignToClass assigns each data point to a class based on the distance to the nearest center.
|
||||
//
|
||||
// This function does not take any parameters.
|
||||
// It does not return anything.
|
||||
func (clustering *KmeansClustering) AssignToClass() {
|
||||
var wg sync.WaitGroup
|
||||
var lock sync.Mutex
|
||||
|
||||
for i := 0; i < clustering.K(); i++ {
|
||||
clustering.sizes[i] = 0
|
||||
}
|
||||
for i := 0; i < clustering.N(); i++ {
|
||||
clustering.distmin[i] = math.MaxFloat64
|
||||
}
|
||||
|
||||
goroutine := func(i int) {
|
||||
defer wg.Done()
|
||||
dmin := math.MaxFloat64
|
||||
cmin := -1
|
||||
for j, center := range clustering.centers {
|
||||
dist := squareDist((*clustering.data)[i], center)
|
||||
if dist < dmin {
|
||||
dmin = dist
|
||||
cmin = j
|
||||
}
|
||||
}
|
||||
lock.Lock()
|
||||
clustering.classes[i] = cmin
|
||||
clustering.sizes[cmin]++
|
||||
clustering.distmin[i] = dmin
|
||||
lock.Unlock()
|
||||
}
|
||||
|
||||
wg.Add(clustering.N())
|
||||
for i := 0; i < clustering.N(); i++ {
|
||||
go goroutine(i)
|
||||
}
|
||||
|
||||
nreset := clustering.ResetEmptyCenters()
|
||||
|
||||
if nreset > 0 {
|
||||
log.Warnf("Reset %d empty centers", nreset)
|
||||
clustering.AssignToClass()
|
||||
}
|
||||
}
|
||||
|
||||
// ComputeCenters calculates the centers of the K-means clustering algorithm.
|
||||
//
|
||||
// It takes no parameters.
|
||||
// It does not return any values.
|
||||
func (clustering *KmeansClustering) ComputeCenters() {
|
||||
var wg sync.WaitGroup
|
||||
centers := clustering.centers
|
||||
data := clustering.data
|
||||
classes := clustering.classes
|
||||
k := clustering.K()
|
||||
|
||||
// Goroutine code
|
||||
goroutine := func(centerIdx int) {
|
||||
goroutine1 := func(centerIdx int) {
|
||||
defer wg.Done()
|
||||
for j, row := range *data {
|
||||
class := classes[j]
|
||||
if class == centerIdx {
|
||||
ns[centerIdx]++
|
||||
for l, val := range row {
|
||||
centers[centerIdx][l] += val
|
||||
}
|
||||
@ -99,149 +196,168 @@ func ComputeCenters(data *obiutils.Matrix[float64], k int, classes []int) *obiut
|
||||
|
||||
for i := 0; i < k; i++ {
|
||||
wg.Add(1)
|
||||
go goroutine(i)
|
||||
go goroutine1(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
for i := range centers {
|
||||
for j := range centers[i] {
|
||||
centers[i][j] /= float64(ns[i])
|
||||
centers[i][j] /= float64(clustering.sizes[i])
|
||||
}
|
||||
}
|
||||
|
||||
return ¢ers
|
||||
}
|
||||
|
||||
// ComputeInertia computes the inertia of the given data and centers in parallel.
|
||||
//
|
||||
// Parameters:
|
||||
// - data: A pointer to a Matrix of float64 representing the data.
|
||||
// - classes: A slice of int representing the class labels for each data point.
|
||||
// - centers: A pointer to a Matrix of float64 representing the centers.
|
||||
//
|
||||
// Return type:
|
||||
// - float64: The computed inertia.
|
||||
func ComputeInertia(data *obiutils.Matrix[float64], classes []int, centers *obiutils.Matrix[float64]) float64 {
|
||||
inertia := make(chan float64)
|
||||
numRows := len(*data)
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(numRows)
|
||||
|
||||
for i := 0; i < numRows; i++ {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
row := (*data)[i]
|
||||
class := classes[i]
|
||||
center := (*centers)[class]
|
||||
inertiaLocal := 0.0
|
||||
for j, val := range row {
|
||||
diff := val - center[j]
|
||||
inertiaLocal += diff * diff
|
||||
goroutine2 := func(centerIdx int) {
|
||||
defer wg.Done()
|
||||
dkmin := math.MaxFloat64
|
||||
dki := -1
|
||||
center := centers[centerIdx]
|
||||
for j, row := range *data {
|
||||
if classes[j] == centerIdx {
|
||||
dist := squareDist(row, center)
|
||||
if dist < dkmin {
|
||||
dkmin = dist
|
||||
dki = j
|
||||
}
|
||||
}
|
||||
inertia <- inertiaLocal
|
||||
}(i)
|
||||
}
|
||||
|
||||
go func() {
|
||||
wg.Wait()
|
||||
close(inertia)
|
||||
}()
|
||||
|
||||
totalInertia := 0.0
|
||||
for localInertia := range inertia {
|
||||
totalInertia += localInertia
|
||||
}
|
||||
|
||||
return totalInertia
|
||||
}
|
||||
|
||||
// Kmeans performs the K-means clustering algorithm on the given data.
|
||||
//
|
||||
// If centers and *centers are not nil, centers is considered as initialized
|
||||
// and the number of classes (k) is set to the number of rows in centers.
|
||||
// otherwise, the number of classes is defined by the value of k.
|
||||
//
|
||||
// Parameters:
|
||||
// - data: A pointer to a Matrix[float64] that represents the input data.
|
||||
// - k: An integer that specifies the number of clusters to create.
|
||||
// - threshold: A float64 value that determines the convergence threshold.
|
||||
// - centers: A pointer to a Matrix[float64] that represents the initial cluster centers.
|
||||
//
|
||||
// Returns:
|
||||
// - classes: A slice of integers that assigns each data point to a cluster.
|
||||
// - centers: A pointer to a Matrix[float64] that contains the final cluster centers.
|
||||
// - inertia: A float64 value that represents the overall inertia of the clustering.
|
||||
// - converged: A boolean value indicating whether the algorithm converged.
|
||||
func Kmeans(data *obiutils.Matrix[float64],
|
||||
k int,
|
||||
threshold float64,
|
||||
centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) {
|
||||
if centers == nil || *centers == nil {
|
||||
*centers = obiutils.Make2DArray[float64](k, len((*data)[0]))
|
||||
center_ids := SampleIntWithoutReplacement(k, len(*data))
|
||||
for i, id := range center_ids {
|
||||
(*centers)[i] = (*data)[id]
|
||||
}
|
||||
} else {
|
||||
k = len(*centers)
|
||||
clustering.icenters[centerIdx] = dki
|
||||
clustering.centers[centerIdx] = (*data)[dki]
|
||||
}
|
||||
|
||||
classes := AssignToClass(data, centers)
|
||||
centers = ComputeCenters(data, k, classes)
|
||||
inertia := ComputeInertia(data, classes, centers)
|
||||
delta := threshold * 100.0
|
||||
for i := 0; i < 100 && delta > threshold; i++ {
|
||||
classes = AssignToClass(data, centers)
|
||||
centers = ComputeCenters(data, k, classes)
|
||||
newi := ComputeInertia(data, classes, centers)
|
||||
delta = inertia - newi
|
||||
inertia = newi
|
||||
log.Debugf("Inertia: %f, delta: %f", inertia, delta)
|
||||
}
|
||||
|
||||
return classes, centers, inertia, delta < threshold
|
||||
}
|
||||
|
||||
// KmeansBestRepresentative finds the best representative among the data point of each cluster in parallel.
|
||||
//
|
||||
// It takes a matrix of data points and a matrix of centers as input.
|
||||
// The best representative is the data point that is closest to the center of the cluster.
|
||||
// Returns an array of integers containing the index of the best representative for each cluster.
|
||||
func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int {
|
||||
bestRepresentative := make([]int, len(*centers))
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(len(*centers))
|
||||
|
||||
for j, center := range *centers {
|
||||
go func(j int, center []float64) {
|
||||
defer wg.Done()
|
||||
|
||||
bestDistToCenter := math.MaxFloat64
|
||||
best := -1
|
||||
|
||||
for i, row := range *data {
|
||||
dist := 0.0
|
||||
for d, val := range row {
|
||||
diff := val - center[d]
|
||||
dist += diff * diff
|
||||
}
|
||||
if dist < bestDistToCenter {
|
||||
bestDistToCenter = dist
|
||||
best = i
|
||||
}
|
||||
}
|
||||
|
||||
if best == -1 {
|
||||
log.Fatalf("No representative found for cluster %d", j)
|
||||
}
|
||||
|
||||
bestRepresentative[j] = best
|
||||
}(j, center)
|
||||
for i := 0; i < k; i++ {
|
||||
wg.Add(1)
|
||||
go goroutine2(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
return bestRepresentative
|
||||
}
|
||||
|
||||
func (clustering *KmeansClustering) Inertia() float64 {
|
||||
inertia := 0.0
|
||||
|
||||
for i := 0; i < clustering.N(); i++ {
|
||||
inertia += clustering.distmin[i]
|
||||
}
|
||||
return inertia
|
||||
}
|
||||
|
||||
func (clustering *KmeansClustering) Centers() obiutils.Matrix[float64] {
|
||||
return clustering.centers
|
||||
}
|
||||
|
||||
func (clustering *KmeansClustering) CentersIndices() []int {
|
||||
return clustering.icenters
|
||||
}
|
||||
|
||||
func (clustering *KmeansClustering) Sizes() []int {
|
||||
return clustering.sizes
|
||||
}
|
||||
|
||||
func (clustering *KmeansClustering) Classes() []int {
|
||||
return clustering.classes
|
||||
}
|
||||
|
||||
func (clustering *KmeansClustering) Run(max_cycle int, threshold float64) bool {
|
||||
prev := math.MaxFloat64
|
||||
newI := clustering.Inertia()
|
||||
for i := 0; i < max_cycle && (prev-newI) > threshold; i++ {
|
||||
prev = newI
|
||||
clustering.AssignToClass()
|
||||
clustering.ComputeCenters()
|
||||
newI = clustering.Inertia()
|
||||
}
|
||||
|
||||
return (prev - newI) <= threshold
|
||||
}
|
||||
|
||||
// // Kmeans performs the K-means clustering algorithm on the given data.
|
||||
|
||||
// // If centers and *centers are not nil, centers is considered as initialized
|
||||
// // and the number of classes (k) is set to the number of rows in centers.
|
||||
// // otherwise, the number of classes is defined by the value of k.
|
||||
|
||||
// // Parameters:
|
||||
// // - data: A pointer to a Matrix[float64] that represents the input data.
|
||||
// // - k: An integer that specifies the number of clusters to create.
|
||||
// // - threshold: A float64 value that determines the convergence threshold.
|
||||
// // - centers: A pointer to a Matrix[float64] that represents the initial cluster centers.
|
||||
|
||||
// // Returns:
|
||||
// // - classes: A slice of integers that assigns each data point to a cluster.
|
||||
// // - centers: A pointer to a Matrix[float64] that contains the final cluster centers.
|
||||
// // - inertia: A float64 value that represents the overall inertia of the clustering.
|
||||
// // - converged: A boolean value indicating whether the algorithm converged.
|
||||
// func Kmeans(data *obiutils.Matrix[float64],
|
||||
// k int,
|
||||
// threshold float64,
|
||||
// centers *obiutils.Matrix[float64]) ([]int, *obiutils.Matrix[float64], float64, bool) {
|
||||
// if centers == nil || *centers == nil {
|
||||
// *centers = obiutils.Make2DArray[float64](k, len((*data)[0]))
|
||||
// center_ids := SampleIntWithoutReplacement(k, len(*data))
|
||||
// for i, id := range center_ids {
|
||||
// (*centers)[i] = (*data)[id]
|
||||
// }
|
||||
// } else {
|
||||
// k = len(*centers)
|
||||
// }
|
||||
|
||||
// classes := AssignToClass(data, centers)
|
||||
// centers = ComputeCenters(data, k, classes)
|
||||
// inertia := ComputeInertia(data, classes, centers)
|
||||
// delta := threshold * 100.0
|
||||
// for i := 0; i < 100 && delta > threshold; i++ {
|
||||
// classes = AssignToClass(data, centers)
|
||||
// centers = ComputeCenters(data, k, classes)
|
||||
// newi := ComputeInertia(data, classes, centers)
|
||||
// delta = inertia - newi
|
||||
// inertia = newi
|
||||
// log.Debugf("Inertia: %f, delta: %f", inertia, delta)
|
||||
// }
|
||||
|
||||
// return classes, centers, inertia, delta < threshold
|
||||
// }
|
||||
|
||||
// // KmeansBestRepresentative finds the best representative among the data point of each cluster in parallel.
|
||||
// //
|
||||
// // It takes a matrix of data points and a matrix of centers as input.
|
||||
// // The best representative is the data point that is closest to the center of the cluster.
|
||||
// // Returns an array of integers containing the index of the best representative for each cluster.
|
||||
// func KmeansBestRepresentative(data *obiutils.Matrix[float64], centers *obiutils.Matrix[float64]) []int {
|
||||
// bestRepresentative := make([]int, len(*centers))
|
||||
|
||||
// var wg sync.WaitGroup
|
||||
// wg.Add(len(*centers))
|
||||
|
||||
// for j, center := range *centers {
|
||||
// go func(j int, center []float64) {
|
||||
// defer wg.Done()
|
||||
|
||||
// bestDistToCenter := math.MaxFloat64
|
||||
// best := -1
|
||||
|
||||
// for i, row := range *data {
|
||||
// dist := 0.0
|
||||
// for d, val := range row {
|
||||
// diff := val - center[d]
|
||||
// dist += diff * diff
|
||||
// }
|
||||
// if dist < bestDistToCenter {
|
||||
// bestDistToCenter = dist
|
||||
// best = i
|
||||
// }
|
||||
// }
|
||||
|
||||
// if best == -1 {
|
||||
// log.Fatalf("No representative found for cluster %d", j)
|
||||
// }
|
||||
|
||||
// bestRepresentative[j] = best
|
||||
// }(j, center)
|
||||
// }
|
||||
|
||||
// wg.Wait()
|
||||
|
||||
// return bestRepresentative
|
||||
// }
|
||||
|
64
pkg/obitable/table.go
Normal file
64
pkg/obitable/table.go
Normal file
@ -0,0 +1,64 @@
|
||||
// Package obitable provides a row-oriented data table structure.
|
||||
package obitable
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"github.com/chen3feng/stl4go"
|
||||
)
|
||||
|
||||
// Header is defined from stl4go.Ordered.
// NOTE(review): its intended use is not visible in this file — confirm.
type Header stl4go.Ordered
|
||||
|
||||
// Row is a single table row: a map from a name to a value of any type.
type Row map[string]interface{}
|
||||
// Table is a row-oriented table: a list of rows and a map of types.
type Table struct {
|
||||
// ColType maps a name to a reflect.Type — presumably column name to column type; confirm.
ColType map[string]reflect.Type
|
||||
// Rows holds the rows of the table.
Rows []Row
|
||||
}
|
||||
|
||||
// RowGetter is a function returning the value associated with a name.
type RowGetter func(name string) interface{}
|
||||
|
||||
func RowFromMap(data map[string]interface{}, navalue string) RowGetter {
|
||||
getter := func(name string) interface{} {
|
||||
value, ok := data[name]
|
||||
|
||||
if !ok {
|
||||
value = navalue
|
||||
}
|
||||
|
||||
return value
|
||||
}
|
||||
|
||||
return getter
|
||||
}
|
||||
|
||||
func RowFromBioSeq(data *obiseq.BioSequence, navalue string) RowGetter {
|
||||
getter := func(name string) interface{} {
|
||||
var value interface{}
|
||||
value = navalue
|
||||
|
||||
switch name {
|
||||
case "id":
|
||||
value = data.Id()
|
||||
case "sequence":
|
||||
value = data.Sequence()
|
||||
case "definition":
|
||||
value = data.Definition()
|
||||
case "taxid":
|
||||
value = data.Taxid()
|
||||
case "count":
|
||||
value = data.Count()
|
||||
default:
|
||||
if data.HasAnnotation() {
|
||||
var ok bool
|
||||
value, ok = data.GetAttribute(name)
|
||||
if !ok {
|
||||
value = navalue
|
||||
}
|
||||
}
|
||||
}
|
||||
return value
|
||||
}
|
||||
|
||||
return getter
|
||||
}
|
@ -124,6 +124,9 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
|
||||
}
|
||||
|
||||
// classes, centers := obistats.Kmeans(&seqworld, n_landmark, &initialCenters)
|
||||
classifier := obistats.MakeKmeansClustering(&seqworld, n_landmark, obistats.DefaultRG())
|
||||
_, centers, inertia, converged := classifier.Run(1000, 0.001)
|
||||
intertia := classifier.Inertia()
|
||||
_, centers, inertia, converged := obistats.Kmeans(&seqworld, n_landmark, 0.001, &initialCenters)
|
||||
|
||||
dist_centers := 0.0
|
||||
|
BIN
sample/.DS_Store
vendored
BIN
sample/.DS_Store
vendored
Binary file not shown.
11
sample/AY189646
Normal file
11
sample/AY189646
Normal file
@ -0,0 +1,11 @@
|
||||
==> db_v05_idx_ori.fasta <==
|
||||
>AY189646 {"count":1,"obitag_ref_index":{"0":"9606@Homo sapiens@species","10":"9526@Catarrhini@parvorder","12":"1437010@Boreoeutheria@clade","16":"9347@Eutheria@clade","17":"40674@Mammalia@class","2":"207598@Homininae@subfamily","22":"117571@Euteleostomi@clade","25":"7776@Gnathostomata@clade","29":"33213@Bilateria@clade","3":"9604@Hominidae@family","30":"6072@Eumetazoa@clade","8":"314295@Hominoidea@superfamily"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens clone arCan119 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product.
|
||||
ttagccctaaacctcaacagttaaatcaacaaaactgctcgccagaacactacgrgccac
|
||||
agcttaaaactcaaaggacctggcggtgcttcatatccct
|
||||
>AY189647 {"count":1,"obitag_ref_index":{"0":"9606@Homo sapiens@species","11":"314295@Hominoidea@superfamily","12":"9526@Catarrhini@parvorder","15":"1437010@Boreoeutheria@clade","18":"9347@Eutheria@clade","19":"40674@Mammalia@class","21":"117571@Euteleostomi@clade","25":"7776@Gnathostomata@clade","30":"33213@Bilateria@clade","31":"6072@Eumetazoa@clade","5":"207598@Homininae@subfamily","6":"9604@Hominidae@family"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens clone arCan120 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product.
|
||||
ttagccctaaacctcaacagttaaatcaacaaaaacwsctcgccagaacactacgagtca
|
||||
cagcttaaaactcaaaggacctggcggtgcttcatatccct
|
||||
>AC008434 {"count":94,"obitag_ref_index":{"0":"207598@Homininae@subfamily","12":"1437010@Boreoeutheria@clade","16":"40674@Mammalia@class","2":"9604@Hominidae@family","21":"117571@Euteleostomi@clade","23":"7776@Gnathostomata@clade","28":"33213@Bilateria@clade","29":"6072@Eumetazoa@clade","7":"314295@Hominoidea@superfamily","9":"9526@Catarrhini@parvorder"},"species_name":"###","taxid":207598} Homo sapiens chromosome 5 clone CTC-325J23, complete sequence.
|
||||
ttagccctaaacttcaacagttaaattaacaaaactgctcgccagaacactacgagccac
|
||||
agcttaaaactcaaaggacctggcggtgcttcatatccct
|
||||
>AC008512 {"count":5,"obitag_ref_index":{"0":"9606@Homo sapiens@species","10":"1437010@Boreoeutheria@clade","14":"40674@Mammalia@class","18":"117571@Euteleostomi@clade","2":"207598@Homininae@subfamily","22":"7776@Gnathostomata@clade","26":"33213@Bilateria@clade","28":"6072@Eumetazoa@clade","7":"9526@Catarrhini@parvorder"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens chromosome 5 clone CTC-454K8, complete sequence.
|
91072
sample/Euka05_230327.xxx
Normal file
91072
sample/Euka05_230327.xxx
Normal file
File diff suppressed because it is too large
Load Diff
91072
sample/Euka05_230327_frg.xxx
Normal file
91072
sample/Euka05_230327_frg.xxx
Normal file
File diff suppressed because it is too large
Load Diff
11
sample/FJ465692
Normal file
11
sample/FJ465692
Normal file
@ -0,0 +1,11 @@
|
||||
==> db_v05_idx_ori.fasta <==
|
||||
>AY189646 {"count":1,"obitag_ref_index":{"0":"9606@Homo sapiens@species","10":"9526@Catarrhini@parvorder","12":"1437010@Boreoeutheria@clade","16":"9347@Eutheria@clade","17":"40674@Mammalia@class","2":"207598@Homininae@subfamily","22":"117571@Euteleostomi@clade","25":"7776@Gnathostomata@clade","29":"33213@Bilateria@clade","3":"9604@Hominidae@family","30":"6072@Eumetazoa@clade","8":"314295@Hominoidea@superfamily"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens clone arCan119 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product.
|
||||
ttagccctaaacctcaacagttaaatcaacaaaactgctcgccagaacactacgrgccac
|
||||
agcttaaaactcaaaggacctggcggtgcttcatatccct
|
||||
>AY189647 {"count":1,"obitag_ref_index":{"0":"9606@Homo sapiens@species","11":"314295@Hominoidea@superfamily","12":"9526@Catarrhini@parvorder","15":"1437010@Boreoeutheria@clade","18":"9347@Eutheria@clade","19":"40674@Mammalia@class","21":"117571@Euteleostomi@clade","25":"7776@Gnathostomata@clade","30":"33213@Bilateria@clade","31":"6072@Eumetazoa@clade","5":"207598@Homininae@subfamily","6":"9604@Hominidae@family"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens clone arCan120 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product.
|
||||
ttagccctaaacctcaacagttaaatcaacaaaaacwsctcgccagaacactacgagtca
|
||||
cagcttaaaactcaaaggacctggcggtgcttcatatccct
|
||||
>AC008434 {"count":94,"obitag_ref_index":{"0":"207598@Homininae@subfamily","12":"1437010@Boreoeutheria@clade","16":"40674@Mammalia@class","2":"9604@Hominidae@family","21":"117571@Euteleostomi@clade","23":"7776@Gnathostomata@clade","28":"33213@Bilateria@clade","29":"6072@Eumetazoa@clade","7":"314295@Hominoidea@superfamily","9":"9526@Catarrhini@parvorder"},"species_name":"###","taxid":207598} Homo sapiens chromosome 5 clone CTC-325J23, complete sequence.
|
||||
ttagccctaaacttcaacagttaaattaacaaaactgctcgccagaacactacgagccac
|
||||
agcttaaaactcaaaggacctggcggtgcttcatatccct
|
||||
>AC008512 {"count":5,"obitag_ref_index":{"0":"9606@Homo sapiens@species","10":"1437010@Boreoeutheria@clade","14":"40674@Mammalia@class","18":"117571@Euteleostomi@clade","2":"207598@Homininae@subfamily","22":"7776@Gnathostomata@clade","26":"33213@Bilateria@clade","28":"6072@Eumetazoa@clade","7":"9526@Catarrhini@parvorder"},"species_name":"Homo sapiens","taxid":9606} Homo sapiens chromosome 5 clone CTC-454K8, complete sequence.
|
BIN
sample/STD_PLN_1.dat.gz
Normal file
BIN
sample/STD_PLN_1.dat.gz
Normal file
Binary file not shown.
BIN
sample/amea1_basecalling_Dorado_HAC_duplex_passANDfail.fastq.gz
Normal file
BIN
sample/amea1_basecalling_Dorado_HAC_duplex_passANDfail.fastq.gz
Normal file
Binary file not shown.
BIN
sample/heap.out
Normal file
BIN
sample/heap.out
Normal file
Binary file not shown.
0
sample/heap.out2
Normal file
0
sample/heap.out2
Normal file
15841
sample/xx.header
Normal file
15841
sample/xx.header
Normal file
File diff suppressed because it is too large
Load Diff
15841
sample/xx.sort.header
Normal file
15841
sample/xx.sort.header
Normal file
File diff suppressed because it is too large
Load Diff
5805
sample/yy.header
Normal file
5805
sample/yy.header
Normal file
File diff suppressed because it is too large
Load Diff
5805
sample/yy.sort.header
Normal file
5805
sample/yy.sort.header
Normal file
File diff suppressed because it is too large
Load Diff
537
test.go
Normal file
537
test.go
Normal file
@ -0,0 +1,537 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"regexp"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"github.com/gabriel-vasile/mimetype"
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
|
||||
fastaDetector := func(raw []byte, limit uint32) bool {
|
||||
ok, err := regexp.Match("^>[^ ]", raw)
|
||||
return ok && err == nil
|
||||
}
|
||||
|
||||
fastqDetector := func(raw []byte, limit uint32) bool {
|
||||
ok, err := regexp.Match("^@[^ ]", raw)
|
||||
return ok && err == nil
|
||||
}
|
||||
|
||||
ecoPCR2Detector := func(raw []byte, limit uint32) bool {
|
||||
ok := bytes.HasPrefix(raw, []byte("#@ecopcr-v2"))
|
||||
return ok
|
||||
}
|
||||
|
||||
genbankDetector := func(raw []byte, limit uint32) bool {
|
||||
ok2 := bytes.HasPrefix(raw, []byte("LOCUS "))
|
||||
ok1, err := regexp.Match("^[^ ]* +Genetic Sequence Data Bank *\n", raw)
|
||||
return ok2 || (ok1 && err == nil)
|
||||
}
|
||||
|
||||
emblDetector := func(raw []byte, limit uint32) bool {
|
||||
ok := bytes.HasPrefix(raw, []byte("ID "))
|
||||
return ok
|
||||
}
|
||||
|
||||
mimetype.Lookup("text/plain").Extend(fastaDetector, "text/fasta", ".fasta")
|
||||
mimetype.Lookup("text/plain").Extend(fastqDetector, "text/fastq", ".fastq")
|
||||
mimetype.Lookup("text/plain").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
|
||||
mimetype.Lookup("text/plain").Extend(genbankDetector, "text/genbank", ".seq")
|
||||
mimetype.Lookup("text/plain").Extend(emblDetector, "text/embl", ".dat")
|
||||
|
||||
// Create a buffer to store the read data
|
||||
buf := make([]byte, 1024*128)
|
||||
n, err := stream.Read(buf)
|
||||
|
||||
if err != nil && err != io.EOF {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// Detect the MIME type using the mimetype library
|
||||
mimeType := mimetype.Detect(buf)
|
||||
if mimeType == nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// Create a new reader based on the read data
|
||||
newReader := io.MultiReader(bytes.NewReader(buf[:n]), stream)
|
||||
|
||||
return mimeType, newReader, nil
|
||||
}
|
||||
|
||||
var xxx1 = `00422_612GNAAXX:7:73:6614:3284#0/1
|
||||
ccgaatatcttagataccccactatgcttagccctaaacataaacattcaataaacaaga
|
||||
atgttcgccagagtactactagcaacagcctgaaactcaaagcacttg
|
||||
>HELIUM_000100422_612GNAAXX:7:13:11063:8138#0/1
|
||||
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattattataacaaa
|
||||
attattcgccagagtactaccggcaatagcttaaaactcacagaactt
|
||||
>HELIUM_000100422_612GNAAXX:7:2:7990:17026#0/1
|
||||
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgcttt
|
||||
tcaggctgttgctagtagtactctggcgaccattcttgtttattgatt
|
||||
>HELIUM_000100422_612GNAAXX:7:3:19649:11224#0/1
|
||||
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
|
||||
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
|
||||
>HELIUM_000100422_612GNAAXX:7:3:8446:7884#0/1
|
||||
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
|
||||
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
|
||||
>HELIUM_000100422_612GNAAXX:7:108:8714:2464#0/1
|
||||
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattaatataacaaa
|
||||
attattcgccagagtactaccggcaatagcttaaaactcaaaggactt
|
||||
>HELIUM_000100422_612GNAAXX:7:28:3969:15209#0/1
|
||||
ccaattaacttagataccccactatgcctagccttaaacacaaatagttatgcaaacaaa
|
||||
actattcgccagagtactaccggcaatagcttaaaactcaacgcactg
|
||||
>HELIUM_000100422_612GNAAXX:7:44:3269:3608#0/1
|
||||
gaagtagtagaacaggctcctctagaagggt`
|
||||
|
||||
var xxx2 = `>HELIUM_000100422_612GNAAXX:7:13:11063:8138#0/1
|
||||
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattattataacaaa
|
||||
attattcgccagagtactaccggcaatagcttaaaactcacagaactt
|
||||
>HELIUM_000100422_612GNAAXX:7:2:7990:17026#0/1
|
||||
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgcttt
|
||||
tcaggctgttgctagtagtactctggcgaccattcttgtttattgatt
|
||||
>HELIUM_000100422_612GNAAXX:7:3:19649:11224#0/1
|
||||
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
|
||||
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
|
||||
>HELIUM_000100422_612GNAAXX:7:3:8446:7884#0/1
|
||||
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
|
||||
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
|
||||
>HELIUM_000100422_612GNAAXX:7:108:8714:2464#0/1
|
||||
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattaatataacaaa
|
||||
attattcgccagagtactaccggcaatagcttaaaactcaaaggactt
|
||||
>HELIUM_000100422_612GNAAXX:7:28:3969:15209#0/1
|
||||
ccaattaacttagataccccactatgcctagccttaaacacaaatagttatgcaaacaaa
|
||||
actattcgccagagtactaccggcaatagcttaaaactcaacgcactg
|
||||
>HELIUM_000100422_612GNAAXX:7:44:3269:3608#0/1
|
||||
gaagtagtagaacaggctcctctagaagggt`
|
||||
|
||||
var xxx3 = `00422_612GNAAXX:7:73:6614:3284#0/1
|
||||
ccgaatatcttagataccccactatgcttagccctaaacataaacattcaataaacaaga
|
||||
atgttcgccagagtactactagcaacagcctgaaactcaaagcacttg
|
||||
>HELIUM_000100422_612GNAAXX:7:13:11063:8138#0/1
|
||||
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattattataacaaa
|
||||
attattcgccagagtactaccggcaatagcttaaaactcacagaactt
|
||||
>HELIUM_000100422_612GNAAXX:7:2:7990:17026#0/1
|
||||
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgcttt
|
||||
tcaggctgttgctagtagtactctggcgaccattcttgtttattgatt
|
||||
>HELIUM_000100422_612GNAAXX:7:3:19649:11224#0/1
|
||||
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
|
||||
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
|
||||
>HELIUM_000100422_612GNAAXX:7:3:8446:7884#0/1
|
||||
ccgaatatctagaacaggctcctctagagggatgtaaagcaccgccaagtcctttgagtt
|
||||
tcaggctgttgctagtagtactctggcgaacattcttgtttattgaat
|
||||
>HELIUM_000100422_612GNAAXX:7:108:8714:2464#0/1
|
||||
ccgcctcctttagataccccactatgcttagccctaaacacaagtaattaatataacaaa
|
||||
attattcgccagagtactaccggcaatagcttaaaactcaaaggactt
|
||||
>HELIUM_000100422_612GNAAXX:7:28:3969:15209#0/1
|
||||
ccaattaacttagataccccactatgcctagccttaaacacaaatagttatgcaaacaaa
|
||||
actattcgccagagtactaccggcaatagcttaaaactcaacgcactg`
|
||||
|
||||
var yyy1 = `@HELIUM_000100422_612GNAAXX:7:1:9007:3289#0/1 {"demultiplex_error":"cannot assign the sequence to a sample"}
|
||||
ccatctctcttagataccccactatgcttagccctaaacacaagtaattaatataacaaaattattcgccagagtactaccggcaatagcttaaaactcaaagaactc
|
||||
+
|
||||
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCACCCCCCCCCBCCACC779?############################################
|
||||
@HELIUM_000100422_612GNAAXX:7:1:8849:9880#0/1 {"demultiplex_error":"cannot match any primer pair"}
|
||||
gatcggaagagcggttcagcaggaatgccgagaccgatatcgtatgccgtcttctgcttgaaaaaaaaaacaaaataggagagtagactcactgccagtggtcgtcag
|
||||
`
|
||||
|
||||
// LastFastqCut splits buffer at the beginning of the last fastq entry whose
// header, sequence and '+' separator lines can all be recognised.
//
// The buffer is scanned backward with a small state machine:
//
//	0: looking for a '+'
//	1: the '+' must be the first character of its line
//	2: skipping separators between the sequence and the '+' line
//	3: walking back along the sequence line
//	4: skipping the end of line(s) between the header and the sequence
//	5: walking back along the header line, looking for '@'
//	6: an '@' was seen; it must be the first character of its line
//	7: the start of the entry has been located
//
// When a candidate fails, the scan resumes in state 0 just before the '+'
// that opened it.
//
// Sequence letters are accepted in both lower and upper case. An '@' that is
// not at the beginning of the header line (for instance inside an annotation
// such as "9606@Homo sapiens") does not discard the candidate: the scan of
// the header line simply goes on.
//
// Returns:
//   - the part of buffer located before the entry (a sub-slice of buffer);
//   - a copy of the part of buffer starting at the '@' of the entry.
//
// If no entry is found, an empty slice and buffer itself are returned.
func LastFastqCut(buffer []byte) ([]byte, []byte) {
	imax := len(buffer)
	cut := imax
	state := 0
	restart := imax - 1

	// isSeqChar tells if c may belong to a sequence line.
	isSeqChar := func(c byte) bool {
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '-' || c == '.'
	}

	for i := restart; i >= 0 && state < 7; i-- {
		C := buffer[i]
		is_end_of_line := C == '\r' || C == '\n'
		is_space := C == ' ' || C == '\t'
		is_sep := is_space || is_end_of_line

		switch state {
		case 0:
			if C == '+' {
				// Potential start of quality part step 1
				state = 1
				restart = i
			}
		case 1:
			if is_end_of_line {
				// Potential start of quality part step 2
				state = 2
			} else {
				// it was not the start of quality part
				state = 0
				i = restart
			}
		case 2:
			if is_sep {
				// Potential start of quality part step 2 (stay in the same state)
				state = 2
			} else if isSeqChar(C) {
				// End of the sequence
				state = 3
			} else {
				// it was not the start of quality part
				state = 0
				i = restart
			}
		case 3:
			if is_end_of_line {
				// Entering the header line
				state = 4
			} else if isSeqChar(C) {
				// progressing along the sequence
				state = 3
			} else {
				// it was not the sequence part
				state = 0
				i = restart
			}
		case 4:
			if is_end_of_line {
				state = 4
			} else {
				state = 5
			}
		case 5:
			if is_end_of_line {
				// It was not the header line
				state = 0
				i = restart
			} else if C == '@' {
				state = 6
				cut = i
			}
		case 6:
			if is_end_of_line {
				// The '@' was the first character of the line
				state = 7
			} else if C == '@' {
				// Another candidate '@' just before the previous one
				cut = i
			} else {
				// The '@' was inside the header line: keep scanning it
				state = 5
			}
		}
	}

	if state == 7 {
		return buffer[:cut], bytes.Clone(buffer[cut:])
	}
	return []byte{}, buffer
}
|
||||
|
||||
// LastSequenceCut splits buffer in front of the last FASTA record start,
// i.e. the last '>' that immediately follows an end-of-line byte.
//
// Returns:
//   - buffer[:last], a sub-slice sharing memory with buffer, and a fresh copy
//     of buffer[last:], where last is the index of that '>';
//   - an empty slice and buffer itself when no such '>' exists. A '>' at
//     index 0 is not preceded by an end of line and therefore does not count.
func LastSequenceCut(buffer []byte) ([]byte, []byte) {
	last := 0
	state := 0 // 0: searching, 1: '>' seen at index last, 2: confirmed

	for i := len(buffer) - 1; i >= 0 && state < 2; i-- {
		c := buffer[i]
		switch {
		case c == '>':
			// Every '>' becomes the current candidate, including one found
			// right before a previous candidate (">>id"): only the '>' that
			// directly follows the end of line is a record start.
			state = 1
			last = i
		case state == 1 && (c == '\r' || c == '\n'):
			state = 2
		default:
			state = 0
		}
	}

	if state == 2 {
		return buffer[:last], bytes.Clone(buffer[last:])
	}

	return []byte{}, buffer
}
|
||||
|
||||
// FirstSequenceCut splits buffer at the first FASTA record start: the first
// '>' that is either the very first byte of buffer or directly follows an
// end-of-line byte.
//
// Returns:
//   - a fresh copy of everything before that '>' and buffer[pos:], a
//     sub-slice sharing memory with buffer;
//   - buffer itself and an empty slice when no record start is found.
func FirstSequenceCut(buffer []byte) ([]byte, []byte) {
	// True while the previous byte was an end of line, and before any byte
	// has been read.
	atLineStart := true

	for pos, c := range buffer {
		if atLineStart && c == '>' {
			return bytes.Clone(buffer[:pos]), buffer[pos:]
		}
		atLineStart = c == '\r' || c == '\n'
	}

	return buffer, []byte{}
}
|
||||
|
||||
func FullSequenceCut(buffer []byte) ([]byte, []byte, []byte) {
|
||||
before, buffer := FirstSequenceCut(buffer)
|
||||
|
||||
if len(buffer) == 0 {
|
||||
return before, []byte{}, []byte{}
|
||||
}
|
||||
|
||||
buffer, after := LastSequenceCut(buffer)
|
||||
return before, buffer, after
|
||||
}
|
||||
|
||||
// Concatenate returns the elements of s1 followed by those of s2.
//
// If one operand is empty the other one is returned as is (no copy). When
// both are non-empty, s1 is re-sliced with its capacity clamped to its
// length before appending, so the append cannot write into spare capacity
// of s1's backing array.
func Concatenate[S ~[]E, E any](s1, s2 S) S {
	switch {
	case len(s1) == 0:
		return s2
	case len(s2) == 0:
		return s1
	}

	return append(s1[:len(s1):len(s1)], s2...)
}
|
||||
|
||||
// FastxChunk is the unit of work sent by FastaChunkReader on its output
// channel and consumed by ParseFastaChunk.
type FastxChunk struct {
|
||||
// Bytes holds the raw bytes of the chunk. FastaChunkReader fills it with a
// private copy of the data; ParseFastaChunk rewrites it in place.
Bytes []byte
|
||||
// index is the emission rank of the chunk (0, 1, 2, ...). ParseFastaChunk
// passes it on to obiiter.MakeBioSequenceBatch.
index int
|
||||
}
|
||||
|
||||
func FastaChunkReader(r io.Reader, size int, cutHead bool) (chan FastxChunk, error) {
|
||||
out := make(chan FastxChunk)
|
||||
buff := make([]byte, size)
|
||||
|
||||
n, err := r.Read(buff)
|
||||
if n > 0 && err == nil {
|
||||
if n < size {
|
||||
buff = buff[:n]
|
||||
}
|
||||
|
||||
begin, buff := FirstSequenceCut(buff)
|
||||
|
||||
if len(begin) > 0 && !cutHead {
|
||||
return out, fmt.Errorf("begin is not empty : %s", string(begin))
|
||||
}
|
||||
|
||||
go func(buff []byte) {
|
||||
idx := 0
|
||||
end := []byte{}
|
||||
|
||||
for err == nil && n > 0 {
|
||||
// fmt.Println("============end=========================")
|
||||
// fmt.Println(string(end))
|
||||
// fmt.Println("------------buff------------------------")
|
||||
// fmt.Println(string(buff))
|
||||
buff = Concatenate(end, buff)
|
||||
// fmt.Println("------------buff--pasted----------------")
|
||||
// fmt.Println(string(buff))
|
||||
buff, end = LastSequenceCut(buff)
|
||||
// fmt.Println("----------------buff--cutted------------")
|
||||
// fmt.Println(string(buff))
|
||||
// fmt.Println("------------------end-------------------")
|
||||
// fmt.Println(string(end))
|
||||
// fmt.Println("========================================")
|
||||
if len(buff) > 0 {
|
||||
out <- FastxChunk{
|
||||
Bytes: bytes.Clone(buff),
|
||||
index: idx,
|
||||
}
|
||||
idx++
|
||||
} else {
|
||||
fmt.Println("***** Empty buff *****")
|
||||
}
|
||||
|
||||
buff = slices.Grow(buff[:0], size)[0:size]
|
||||
n, err = r.Read(buff)
|
||||
if n < size {
|
||||
buff = buff[:n]
|
||||
}
|
||||
// fmt.Printf("n = %d, err = %v\n", n, err)
|
||||
}
|
||||
|
||||
if len(end) > 0 {
|
||||
out <- FastxChunk{
|
||||
Bytes: bytes.Clone(end),
|
||||
index: idx,
|
||||
}
|
||||
}
|
||||
|
||||
close(out)
|
||||
}(buff)
|
||||
}
|
||||
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func ParseFastaChunk(ch FastxChunk) *obiiter.BioSequenceBatch {
|
||||
fmt.Println(string(ch.Bytes))
|
||||
slice := make(obiseq.BioSequenceSlice, 0, obioptions.CLIBatchSize())
|
||||
|
||||
state := 0
|
||||
start := 0
|
||||
current := 0
|
||||
var identifier string
|
||||
var definition string
|
||||
|
||||
for i := 0; i < len(ch.Bytes); i++ {
|
||||
C := ch.Bytes[i]
|
||||
is_end_of_line := C == '\r' || C == '\n'
|
||||
is_space := C == ' ' || C == '\t'
|
||||
is_sep := is_space || is_end_of_line
|
||||
|
||||
switch state {
|
||||
case 0:
|
||||
if C == '>' {
|
||||
// Beginning of sequence
|
||||
state = 1
|
||||
}
|
||||
case 1:
|
||||
if is_sep {
|
||||
// No identifier -> ERROR
|
||||
return nil
|
||||
} else {
|
||||
// Beginning of identifier
|
||||
state = 2
|
||||
start = i
|
||||
}
|
||||
case 2:
|
||||
if is_sep {
|
||||
// End of identifier
|
||||
identifier = string(ch.Bytes[start:i])
|
||||
state = 3
|
||||
}
|
||||
case 3:
|
||||
if is_end_of_line {
|
||||
// Definition empty
|
||||
definition = ""
|
||||
state = 5
|
||||
} else if !is_space {
|
||||
// Beginning of definition
|
||||
start = i
|
||||
state = 4
|
||||
}
|
||||
case 4:
|
||||
if is_end_of_line {
|
||||
definition = string(ch.Bytes[start:i])
|
||||
state = 5
|
||||
|
||||
}
|
||||
case 5:
|
||||
if !is_end_of_line {
|
||||
// Beginning of sequence
|
||||
start = i
|
||||
current = i
|
||||
state = 6
|
||||
}
|
||||
case 6:
|
||||
if C == '>' {
|
||||
// End of sequence
|
||||
s := obiseq.NewBioSequence(identifier, bytes.Clone(ch.Bytes[start:current]), definition)
|
||||
slice = append(slice, s)
|
||||
state = 1
|
||||
|
||||
} else if !is_sep {
|
||||
if C >= 'A' && C <= 'Z' {
|
||||
C = C + 'a' - 'A'
|
||||
}
|
||||
// Removing white space from the sequence
|
||||
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' {
|
||||
ch.Bytes[current] = C
|
||||
current++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("Index = %d, State = %d\n", ch.index, state)
|
||||
slice = append(slice, obiseq.NewBioSequence(identifier, bytes.Clone(ch.Bytes[start:current]), definition))
|
||||
batch := obiiter.MakeBioSequenceBatch(ch.index, slice)
|
||||
return &batch
|
||||
}
|
||||
|
||||
func ReadFastaSequence(reader io.Reader) obiiter.IBioSequence {
|
||||
out := obiiter.MakeIBioSequence()
|
||||
|
||||
nworker := obioptions.CLIReadParallelWorkers()
|
||||
out.Add(nworker)
|
||||
|
||||
chkchan, err := FastaChunkReader(reader, 1024*500, false)
|
||||
|
||||
if err != nil {
|
||||
log.Panicln("Error:", err)
|
||||
}
|
||||
|
||||
go func() {
|
||||
out.WaitAndClose()
|
||||
}()
|
||||
|
||||
parser := func() {
|
||||
defer out.Done()
|
||||
for chk := range chkchan {
|
||||
seqs := ParseFastaChunk(chk)
|
||||
if seqs != nil {
|
||||
out.Push(*seqs)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for i := 0; i < nworker; i++ {
|
||||
go parser()
|
||||
}
|
||||
|
||||
return out.SortBatches().Rebatch(obioptions.CLIBatchSize())
|
||||
}
|
||||
|
||||
func main() {
|
||||
// if len(os.Args) != 2 {
|
||||
// fmt.Println("Usage: go run main.go <filename>")
|
||||
// return
|
||||
// }
|
||||
|
||||
// filename := os.Args[1]
|
||||
// filename := "100.fasta"
|
||||
// file, err := os.Open(filename)
|
||||
// if err != nil {
|
||||
// fmt.Println("Error:", err)
|
||||
// return
|
||||
// }
|
||||
// defer file.Close()
|
||||
|
||||
// mimeType, input, err := OBIMimeTypeGuesser(file)
|
||||
// if err != nil {
|
||||
// fmt.Println("Error:", err)
|
||||
// return
|
||||
// }
|
||||
|
||||
// fmt.Println("Detected MIME Type:", mimeType.String())
|
||||
|
||||
// ch, err := FastaChunkReader(input, 1024, false)
|
||||
|
||||
// if err != nil {
|
||||
// fmt.Println("Error:", err)
|
||||
// return
|
||||
// }
|
||||
|
||||
// for chk := range ch {
|
||||
// fmt.Printf("--------------------\n")
|
||||
// b := ParseFastaChunk(chk)
|
||||
// fmt.Printf("-------- %d --------\n", b.Order())
|
||||
// for _, b := range b.Slice() {
|
||||
// fmt.Printf("--%s--\t--%s--\t--%s--\n", b.Id(), b.Definition(), b.String())
|
||||
// }
|
||||
// }
|
||||
|
||||
d1, f1 := LastFastqCut([]byte(yyy1))
|
||||
// d2, f2 := LastSequenceCut([]byte(xxx2))
|
||||
// d3, f3 := LastSequenceCut([]byte(xxx3))
|
||||
|
||||
fmt.Println("Last Sequence Cut 1:", string(d1), "---", string(f1))
|
||||
// fmt.Println("Last Sequence Cut 2:", string(d2), "---", string(f2))
|
||||
// fmt.Println("Last Sequence Cut 3:", string(d3), "---", string(f3))
|
||||
|
||||
// d1, b1, f1 := FullSequenceCut([]byte(xxx1))
|
||||
// d2, b2, f2 := FullSequenceCut([]byte(xxx2))
|
||||
// d3, b3, f3 := FullSequenceCut([]byte(xxx3))
|
||||
|
||||
// fmt.Println("Last Sequence Cut:", string(d1), "---", string(b1), "---", string(f1))
|
||||
// fmt.Println("Last Sequence Cut:", string(d2), "---", string(b2), "---", string(f2))
|
||||
// fmt.Println("Last Sequence Cut:", string(d3), "---", string(b3), "---", string(f3))
|
||||
|
||||
// Now you can use "extractedData" to access the read data with the associated MIME type.
|
||||
// For example, you can copy the data into a buffer for further manipulation.
|
||||
}
|
1
titi.fasta.gz.REMOVED.git-id
Normal file
1
titi.fasta.gz.REMOVED.git-id
Normal file
@ -0,0 +1 @@
|
||||
061a528427b8ecb0b30df8e7923edc4220443ade
|
BIN
toto.csv.gz
Normal file
BIN
toto.csv.gz
Normal file
Binary file not shown.
BIN
wolf_R1.fasta.gz
Normal file
BIN
wolf_R1.fasta.gz
Normal file
Binary file not shown.
BIN
wolf_R2.fasta.gz
Normal file
BIN
wolf_R2.fasta.gz
Normal file
Binary file not shown.
6
wolf_count.csv
Normal file
6
wolf_count.csv
Normal file
@ -0,0 +1,6 @@
|
||||
id,HELIUM_000100422_612GNAAXX:7:100:4828:3492#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:113:17236:15166#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:16:12111:9453#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:17:3675:13316#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:20:15729:20493#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:22:2603:18023#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:25:11714:14251#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:2:15508:17530#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:30:17945:19531#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:34:14122:13731#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:34:17680:16952#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:52:12776:11698#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:53:17880:8617#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:58:11419:17203#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:5:15939:5437#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:66:4039:8016#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:69:15276:10367#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:72:17638:8081#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:7:18108:9040#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:7:2880:4021#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:80:10626:19388#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:81:18704:12346#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:84:14502:1617#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:84:16335:5083#0/1_sub[28..126],HELIUM_000100422_612GNAAXX:7:94:16908:11285#0/1_sub[28..127],HELIUM_000100422_612GNAAXX:7:9:9623:15395#0/1_sub[28..127]
|
||||
15a_F730814,0,0,0,0,0,9165,0,0,0,0,0,0,0,5,0,0,0,0,4,0,0,0,0,0,0,0
|
||||
13a_F730603,22,0,1,0,0,0,0,0,0,0,15,0,0,19,0,25,0,0,9,0,20,0,0,0,8409,0
|
||||
29a_F260619,0,0,13,0,16,6139,0,0,0,0,0,0,0,1,0,0,25,0,0,44,0,0,391,110,0,353
|
||||
26a_F040644,0,72,0,17,0,0,14,18,43,31,0,52,88,481,12830,0,0,15,0,0,0,208,0,14,0,0
|
||||
|
|
4
wolf_diet_ngsfilter.txt
Executable file
4
wolf_diet_ngsfilter.txt
Executable file
@ -0,0 +1,4 @@
|
||||
wolf_diet 13a_F730603 aattaac TTAGATACCCCACTATGC TAGAACAGGCTCCTCTAG F @
|
||||
wolf_diet 13b_F730603 gaagtag TTAGATACCCCACTATGC TAGAACAGGCTCCTCTAG F @
|
||||
wolf_diet 26a_F040644 gaatatc TTAGATACCCCACTATGC TAGAACAGGCTCCTCTAG F @
|
||||
wolf_diet 29a_F260619 gcctcct TTAGATACCCCACTATGC TAGAACAGGCTCCTCTAG F @
|
1
xxx.gb.REMOVED.git-id
Normal file
1
xxx.gb.REMOVED.git-id
Normal file
@ -0,0 +1 @@
|
||||
b92f63b34879105e61db0faf868f2828b4db298d
|
1860
yyy_LCS.csv
Normal file
1860
yyy_LCS.csv
Normal file
File diff suppressed because it is too large
Load Diff
1104
yyy_geom.csv
Normal file
1104
yyy_geom.csv
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user